Lewis7887 commited on
Commit
95fbd1e
·
verified ·
1 Parent(s): df5e808

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -0
app.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import spaces
3
+ import torch
4
+ from transformers import AutoModelForCausalLM, AutoTokenizer
5
+
6
# The exact name of the model on Hugging Face
model_id = "Qwen/Qwen2.5-7B-Instruct"

# 1. Load the Tokenizer (downloads/caches the vocab + chat template for model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# 2. Load the Model
# bfloat16 halves the memory footprint vs float32 so the 7B model fits in the
# free ZeroGPU memory; device_map="auto" lets accelerate place the weights on
# whatever device is available (GPU when the @spaces.GPU call is active).
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)
17
+
18
+
19
# 3. The Generation Function
# The @spaces.GPU decorator requests a ZeroGPU slice for the duration of each call.
@spaces.GPU
def generate_response(message, history):
    """Generate one chat reply from Qwen2.5 for a gr.ChatInterface turn.

    Parameters
    ----------
    message : str
        The newest user message.
    history : list
        Prior turns as supplied by gr.ChatInterface — either (user, assistant)
        pairs (classic "tuples" format) or {"role": ..., "content": ...} dicts
        (newer "messages" format). Both are accepted.

    Returns
    -------
    str
        The assistant's reply text, with the prompt stripped and special
        tokens removed.
    """
    # Normalize the conversation history into the role/content message list
    # that Qwen's chat template expects.
    messages = []
    for turn in history:
        if isinstance(turn, dict):
            # "messages" format: already {"role": ..., "content": ...}
            messages.append({"role": turn["role"], "content": turn["content"]})
        else:
            # "tuples" format: (user_msg, bot_msg)
            user_msg, bot_msg = turn
            messages.append({"role": "user", "content": user_msg})
            messages.append({"role": "assistant", "content": bot_msg})

    # Add the newest message
    messages.append({"role": "user", "content": message})

    # Apply Qwen's specific chat template
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Convert text to tokens and move them to the model's device
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Generate the response.
    # BUGFIX: do_sample=True is required for temperature to have any effect —
    # without it transformers uses greedy decoding and silently ignores
    # temperature (emitting a warning).
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512,  # Maximum length of the response
        do_sample=True,      # Enable sampling so temperature is honored
        temperature=0.7,     # Creativity (0.0 is robotic, 1.0 is highly creative)
    )

    # Strip away the prompt so we only return the newly generated tokens
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    # Decode tokens back into readable text
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response
56
+
57
+
58
# 4. Build the Web Interface
# gr.ChatInterface wires generate_response to a ready-made chat UI and calls
# it with (message, history) on every user turn.
demo = gr.ChatInterface(
    fn=generate_response,
    title="My Qwen 2.5 Chatbot",
    description="Running entirely for free using Hugging Face ZeroGPU.",
)
64
+
65
# 5. Launch the app (only when run as a script, not when imported as a module)
if __name__ == "__main__":
    demo.launch()