rr19tech committed on
Commit
c154452
·
verified ·
1 Parent(s): 706db59
Files changed (1) hide show
  1. app.py +3 -24
app.py CHANGED
@@ -1,34 +1,13 @@
1
  import gradio as gr
2
  from transformers import pipeline, AutoTokenizer
3
- from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
4
 
5
 
6
- #model_id = "HuggingFaceTB/SmolLM2-135M-Instruct"
7
- #we will be using Qwen2.5-0.5B-Instruct model is a lightweight powerhouse
8
  model_id = "Qwen/Qwen2.5-0.5B-Instruct"
9
- quantized_model_dir = "qwen2.5-0.5b-gptq-4bit"
10
 
11
- quantize_config = BaseQuantizeConfig(
12
- bits=4, # Quantize to 4-bit
13
- group_size=128, # Recommended setting
14
- desc_act=False # Set to False for faster inference
15
- )
16
- tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
17
- #model = AutoGPTQForCausalLM.from_pretrained(model_id, quantize_config)
18
- model = AutoGPTQForCausalLM.from_pretrained(
19
- model_id,
20
- device_map="cpu",
21
- use_cuda_fp16=False # Critical for CPU-only environments
22
- )
23
 
24
- #pipe = pipeline("text-generation", model=model_id, device_map="auto") #adding a auto detect gpu
25
- #tokenizer = AutoTokenizer.from_pretrained(model_id)
26
- pipe = pipeline(
27
- "text-generation",
28
- model=model,
29
- tokenizer=tokenizer,
30
- device_map="cpu"
31
- )
32
 
33
 
34
  def chat(message, history):
 
1
  import gradio as gr
2
  from transformers import pipeline, AutoTokenizer
 
3
 
4
 
 
 
5
  model_id = "Qwen/Qwen2.5-0.5B-Instruct"
 
6
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
+ pipe = pipeline("text-generation", model=model_id, device_map="auto") #adding a auto detect gpu
9
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
10
+
 
 
 
 
 
11
 
12
 
13
  def chat(message, history):