Spaces:

Eeppa
/

VerySlowAISmartedition

Running

App Files Files Community

VerySlowAISmartedition / app.py

Eeppa

Update app.py

9fbbe42 verified 11 days ago

raw

history blame contribute delete

2.11 kB

	import gradio as gr
	from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
	import torch
	from threading import Thread

	# Hugging Face Hub checkpoint to serve: DeepSeek-R1-Distill-Qwen-1.5B.
	model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

	# The tokenizer is pulled from the same checkpoint as the weights.
	tokenizer = AutoTokenizer.from_pretrained(model_id)

	# Loading options, kept together so they are easy to audit:
	#   * bfloat16 weights roughly halve memory use compared with float32,
	#     which is what keeps the app from crashing on small machines;
	#   * device_map="auto" lets the library decide where the weights live;
	#   * low_cpu_mem_usage=True asks for a lower peak RAM while loading.
	_model_load_options = {
	    "torch_dtype": torch.bfloat16,
	    "device_map": "auto",
	    "low_cpu_mem_usage": True,
	}
	model = AutoModelForCausalLM.from_pretrained(model_id, **_model_load_options)

	def _content_to_text(content):
	    """Flatten one chat ``content`` value into plain text.

	    Handles the shapes a chat history entry may carry: ``None`` (no reply
	    yet), a plain string, a single content block such as
	    ``{"type": "text", "text": "..."}``, or a list of such blocks/strings.
	    Blocks without a string ``"text"`` field (e.g. file attachments)
	    contribute nothing, because this model only consumes text.
	    """
	    if content is None:
	        return ""
	    if isinstance(content, str):
	        return content
	    if isinstance(content, dict):
	        text = content.get("text")
	        return text if isinstance(text, str) else ""
	    if isinstance(content, (list, tuple)):
	        return "".join(_content_to_text(part) for part in content)
	    return str(content)


	def _history_to_messages(history):
	    """Convert a Gradio chat history into chat-template messages.

	    Accepts both history layouts:

	    * legacy pairs: ``[[user_msg, assistant_msg], ...]``
	    * messages style: ``[{"role": ..., "content": ...}, ...]``

	    Returns a list of ``{"role", "content"}`` dicts containing only
	    ``user`` and ``assistant`` turns with string content.
	    """
	    messages = []
	    for turn in history or []:
	        if isinstance(turn, dict):
	            role = turn.get("role")
	            # Ignore any role the chat template is not being fed here
	            # (the system prompt is added by the caller).
	            if role in ("user", "assistant"):
	                messages.append(
	                    {"role": role, "content": _content_to_text(turn.get("content"))}
	                )
	            continue

	        user_msg, assistant_msg = turn
	        # Either side of a pair may be None (e.g. a reply that never
	        # arrived); passing None as content would break the chat template.
	        if user_msg is not None:
	            messages.append({"role": "user", "content": _content_to_text(user_msg)})
	        if assistant_msg is not None:
	            messages.append(
	                {"role": "assistant", "content": _content_to_text(assistant_msg)}
	            )
	    return messages


	def generate_response(message, history):
	    """Stream the model's reply to ``message`` given the prior ``history``.

	    Args:
	        message: The newest user message.
	        history: Previous turns, either as ``[user, assistant]`` pairs or as
	            messages-style ``{"role", "content"}`` dicts; both are accepted
	            so the function works regardless of which layout the installed
	            Gradio version passes.

	    Yields:
	        The reply accumulated so far, growing with each streamed chunk.

	    Raises:
	        Exception: Whatever ``model.generate`` raised in the worker thread,
	            re-raised here once the stream has been drained.
	    """
	    # System prompt to keep the model focused
	    system_prompt = "You are DeepSeek-R1, a helpful assistant. Use the <think> tags to show your reasoning."

	    # Build conversation with history
	    messages = [{"role": "system", "content": system_prompt}]
	    messages.extend(_history_to_messages(history))
	    messages.append({"role": "user", "content": message})

	    # Prepare the input
	    input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
	    inputs = tokenizer([input_text], return_tensors="pt").to(model.device)

	    # Setup streaming
	    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

	    generation_kwargs = dict(
	        inputs,
	        streamer=streamer,
	        max_new_tokens=512,
	        temperature=0.6,
	        repetition_penalty=1.1,
	        do_sample=True,
	    )

	    failures = []

	    def _run_generation():
	        # If generation dies, the streamer would never receive its end
	        # signal and the consuming loop below would block forever; close
	        # the stream explicitly and keep the error for the caller.
	        try:
	            model.generate(**generation_kwargs)
	        except Exception as exc:
	            failures.append(exc)
	            streamer.end()

	    # Run in a thread so the UI doesn't freeze
	    thread = Thread(target=_run_generation)
	    thread.start()

	    partial_text = ""
	    for new_text in streamer:
	        partial_text += new_text
	        yield partial_text

	    thread.join()
	    if failures:
	        raise failures[0]

	# Chat UI wired to the streaming generator above. The 'theme' argument is
	# deliberately left out because passing it causes errors on Gradio 6.
	demo = gr.ChatInterface(
	    generate_response,
	    title="DeepSeek-R1 (1.5B) - Smart Slow AI",
	    description="Streaming enabled. Watch it think!",
	)

	# Start the web server only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()