nova committed on
Commit
1c4aa85
·
verified ·
1 Parent(s): 428206d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -20
app.py CHANGED
@@ -1,51 +1,50 @@
1
- mport gradio as gr
2
  from llama_cpp import Llama
3
  from huggingface_hub import hf_hub_download
4
- # PRO: Qwen 2.5 14B Instruct (GGUF) - Q4_K_M (Best Balance)
5
- # Fits in 16GB RAM (Free Tier T4 GPU or CPU Upgrade)
6
- # Size: ~9GB Model + Context
7
- REPO_ID = "Qwen/Qwen2.5-14B-Instruct-GGUF"
8
- FILENAME = "qwen2.5-14b-instruct-q4_k_m.gguf"
9
- print(f"Downloading {FILENAME} from {REPO_ID}...")
10
- # Use 'main' or specific quantization repo if needed
11
- # Actually Qwen official usually puts GGUF in separate repo or subfolder.
12
- # Let's use a reliable GGUF mirror/quantizer if official is confusing, but Bartowski is standard.
13
- # REPO: bartowski/Qwen2.5-14B-Instruct-GGUF
14
  REPO_ID = "bartowski/Qwen2.5-14B-Instruct-GGUF"
15
  FILENAME = "Qwen2.5-14B-Instruct-Q4_K_M.gguf"
 
 
16
  model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
 
17
  print("Loading model...")
18
  llm = Llama(
19
  model_path=model_path,
20
- n_ctx=4096,
21
- n_threads=2, # Keep low for shared CPU
22
  verbose=False
23
  )
 
24
  def generate_pro(message, history):
25
- # Qwen 2.5 Prompt Format
26
- # <|im_start|>system\n...<|im_end|>\n<|im_start|>user\n...
27
-
28
- prompt = "<|im_start|>system\nYou are Lumin Pro, an intelligent AI assistant built by NovaPixel.<|im_end|>\n"
29
  for user_msg, bot_msg in history:
30
  prompt += f"<|im_start|>user\n{user_msg}<|im_end|>\n<|im_start|>assistant\n{bot_msg}<|im_end|>\n"
31
  prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
 
32
  stream = llm.create_completion(
33
  prompt,
34
  max_tokens=2048,
35
  stop=["<|im_end|>"],
36
  stream=True,
37
- temperature=0.7
 
38
  )
 
39
  partial_text = ""
40
  for output in stream:
41
  delta = output['choices'][0]['text']
42
  partial_text += delta
43
  yield partial_text
 
44
  chat_interface = gr.ChatInterface(
45
  fn=generate_pro,
46
  title="🌟 Lumin Pro (Qwen 14B)",
47
- description="Running Qwen 2.5 14B Instruct (GGUF) locally.",
48
- examples=["Explain Quantum Physics", "Write a Python script for scraping"],
49
  )
 
50
  if __name__ == "__main__":
51
  chat_interface.launch(server_name="0.0.0.0", server_port=7860)
 
1
+ import gradio as gr
2
  from llama_cpp import Llama
3
  from huggingface_hub import hf_hub_download
4
+
5
# PRO: Qwen 2.5 14B Instruct (GGUF), Q4_K_M quantization.
# Fits in 16 GB of RAM (tight, but it works).
REPO_ID = "bartowski/Qwen2.5-14B-Instruct-GGUF"
FILENAME = "Qwen2.5-14B-Instruct-Q4_K_M.gguf"

# Fetch the quantized weights from the Hub (cached after first run).
print(f"Downloading {FILENAME} from {REPO_ID}...")
model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)

# Load the GGUF model for CPU inference.
print("Loading model...")
llm = Llama(
    model_path=model_path,
    n_ctx=8192,    # moderate context window to stay within available RAM
    n_threads=2,   # keep thread count low to be friendly on shared CPUs
    verbose=False,
)
20
+
21
def generate_pro(message, history):
    """Stream a response from the local Qwen model for a Gradio chat turn.

    Args:
        message: The latest user message.
        history: Prior turns as (user_message, assistant_message) pairs.

    Yields:
        The accumulated assistant reply after each streamed token, so the
        UI can render the response incrementally.
    """
    # Assemble the prompt in ChatML format (Qwen's native chat template),
    # replaying the conversation history before the new message.
    segments = []
    for past_user, past_assistant in history:
        segments.append(
            f"<|im_start|>user\n{past_user}<|im_end|>\n<|im_start|>assistant\n{past_assistant}<|im_end|>\n"
        )
    segments.append(
        f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
    )
    prompt = "".join(segments)

    stream = llm.create_completion(
        prompt,
        max_tokens=2048,
        stop=["<|im_end|>"],
        stream=True,
        temperature=0.7,
        top_p=0.9,
    )

    # Accumulate streamed deltas and yield the running text each time.
    accumulated = ""
    for chunk in stream:
        accumulated += chunk['choices'][0]['text']
        yield accumulated
42
+
43
# Wire the streaming generator into a Gradio chat UI.
chat_interface = gr.ChatInterface(
    fn=generate_pro,
    title="🌟 Lumin Pro (Qwen 14B)",
    description="Running Qwen2.5-14B-Instruct (GGUF). Balanced Power.",
)

if __name__ == "__main__":
    # Bind to all interfaces on port 7860 (the standard HF Spaces port).
    chat_interface.launch(server_name="0.0.0.0", server_port=7860)