| import gradio as gr |
| from huggingface_hub import hf_hub_download |
| from llama_cpp import Llama |
|
|
| |
# Download the GGUF weights file from the Hugging Face Hub (cached locally
# after the first run; returns the local filesystem path).
# NOTE(review): `filename` must match an actual file inside the repo.
# "Nidum-Llama-3.2-3B-Uncensored-GGUF.gguf" looks like the repo name reused
# as a filename — verify against the repo's file listing (quantized GGUFs are
# usually named like "...Q4_K_M.gguf").
model_path = hf_hub_download(
    repo_id="VibeStudio/Nidum-Llama-3.2-3B-Uncensored-GGUF",
    filename="Nidum-Llama-3.2-3B-Uncensored-GGUF.gguf"
)
|
|
| |
# Load the model with the llama.cpp Python bindings, CPU-only.
llm = Llama(
    model_path=model_path,  # local path returned by hf_hub_download above
    n_ctx=8192,             # context window size in tokens
    n_batch=128,            # prompt-evaluation batch size; kept small for a basic CPU
    n_gpu_layers=0,         # 0 = offload nothing; run entirely on CPU
    verbose=False           # suppress llama.cpp load/eval logging
)
|
|
| |
def format_prompt(message, history):
    """Build a Llama-3-style chat prompt from past turns plus the new message.

    `history` is expected as an iterable of (user, assistant) string pairs
    (Gradio tuple-format chat history). The result ends with an open
    assistant header so the model completes the assistant's reply.
    """
    segments = ["<|begin_of_text|>"]
    for user_turn, assistant_turn in history:
        segments.append(f"<|start_header_id|>user<|end_header_id|>\n{user_turn}<|eot_id|>")
        segments.append(f"<|start_header_id|>assistant<|end_header_id|>\n{assistant_turn}<|eot_id|>")
    segments.append(f"<|start_header_id|>user<|end_header_id|>\n{message}<|eot_id|>")
    segments.append("<|start_header_id|>assistant<|end_header_id|>\n")
    return "".join(segments)
|
|
def chat(message, history):
    """Produce one assistant reply for the Gradio ChatInterface.

    Formats the full conversation into a Llama-3 prompt, runs a single
    completion on the module-level `llm`, and returns the stripped text.
    NOTE(review): no explicit `stop` sequence is passed — generation relies
    on the model emitting its end-of-turn token; confirm the GGUF's EOS
    covers <|eot_id|>.
    """
    completion = llm(
        format_prompt(message, history),
        max_tokens=2048,      # cap on generated tokens per reply
        temperature=0.7,
        top_p=0.95,
        repeat_penalty=1.15,
    )
    reply = completion["choices"][0]["text"]
    return reply.strip()
|
|
| |
# Wire the chat callback into a Gradio chat UI and start the web server.
# NOTE(review): format_prompt unpacks history as (user, assistant) tuples,
# which matches Gradio's tuple-format history; Gradio 5 defaults to
# "messages"-format (dict) history — confirm the installed Gradio version
# or pass the matching type explicitly.
gr.ChatInterface(
    chat,
    title="Llama 3.2 3B Uncensored (CPU Básica)",
    description="IA sem censura rodando só em CPU gratuita! Respostas em ~10-20s."
).launch()
|
|
|
|