"""Gradio chat demo: run PowerInfer/SmallThinker-21BA3B-Instruct locally on CPU with Hugging Face Transformers."""

import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
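
# Requires: pip install gradio transformers torch accelerate
# (accelerate is needed because from_pretrained is called with device_map below)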
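
# Model identifier on the Hugging Face Hub. The "21BA3B" suffix indicates a
# mixture-of-experts design: roughly 21B total parameters, about 3B active per token.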
model_id = "PowerInfer/SmallThinker-21BA3B-Instruct"
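
# Load the tokenizer. trust_remote_code allows any custom tokenizer code
# bundled with the model repo to execute locally.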
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True
)
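
# Load the weights onto the CPU in full float32 precision. Note that ~21B
# parameters at 4 bytes each is on the order of 84 GB of RAM; loading in
# torch.bfloat16 roughly halves that if your hardware supports it.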
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cpu",
    torch_dtype=torch.float32,
    trust_remote_code=True
)
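
# Wrap the model and tokenizer in a text-generation pipeline.
# device=-1 selects the CPU, matching the device_map used above.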
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=-1
)
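
# Generate a completion for the user's prompt. Sampling is enabled so the
# temperature slider has an effect, and pad_token_id is set to the EOS token
# to silence the warning transformers emits during open-ended generation.
# For an instruction-tuned model, formatting the prompt with
# tokenizer.apply_chat_template may give better results; the raw-string
# prompt here keeps the demo minimal.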
def chat(prompt, max_new_tokens=256, temperature=0.7):
    output = generator(
        prompt,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        return_full_text=False  # return only the completion, not the echoed prompt
    )
    return output[0]["generated_text"]
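
# Build and launch the web UI. Gradio serves on http://127.0.0.1:7860 by
# default; pass share=True to launch() for a temporary public link.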
gr.Interface(
    fn=chat,
    inputs=[
        gr.Textbox(label="Prompt", lines=4, placeholder="Ask anything..."),
        gr.Slider(32, 512, value=256, step=16, label="Max New Tokens"),
        gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Temperature")
    ],
    outputs=gr.Textbox(label="Response"),
    title="💬 SmallThinker-21BA3B-Instruct",
    description="Run PowerInfer/SmallThinker-21BA3B-Instruct locally on CPU using Hugging Face + Gradio"
).launch()