Goated121 committed on
Commit
e540d02
·
verified ·
1 Parent(s): 91b4de2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -17
app.py CHANGED
@@ -68,30 +68,27 @@ def retrieve_context(query):
68
  return context.strip()
69
 
70
  # -----------------------------
71
- # Load FAST model (CPU friendly)
72
  # -----------------------------
73
- model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
74
-
75
- print("Loading fast model...")
76
 
77
  tokenizer = AutoTokenizer.from_pretrained(model_name)
78
-
79
  model = AutoModelForCausalLM.from_pretrained(
80
  model_name,
81
- torch_dtype=torch.float32
82
  )
83
 
84
  generator = pipeline(
85
  "text-generation",
86
  model=model,
87
  tokenizer=tokenizer,
88
- max_new_tokens=120,
89
  do_sample=True,
90
  temperature=0.6,
91
- device=-1 # CPU
92
  )
93
 
94
- print("Fast LLM loaded successfully!")
95
 
96
  # -----------------------------
97
  # Chat function
@@ -99,10 +96,6 @@ print("Fast LLM loaded successfully!")
99
  def chat(user_input):
100
  context = retrieve_context(user_input)
101
 
102
- # ⚡ Instant response if context is already short
103
- if context and len(context.split()) < 50:
104
- return context.strip()
105
-
106
  if not context:
107
  return "I don't know."
108
 
@@ -120,8 +113,7 @@ Question:
120
 
121
  Answer in short and clear sentences.
122
  """
123
-
124
- response = generator(prompt)
125
  text = response[0]["generated_text"]
126
 
127
  # Remove prompt if repeated
@@ -137,6 +129,6 @@ gr.Interface(
137
  fn=chat,
138
  inputs="text",
139
  outputs="text",
140
- title="Livestock Chatbot (RAG + Fast LLM)",
141
- description="Fast chatbot using RAG + TinyLlama (optimized for CPU)"
142
  ).launch()
 
68
  return context.strip()
69
 
70
  # -----------------------------
71
+ # Load Qwen model (CPU only, no accelerate)
72
  # -----------------------------
73
+ model_name = "Qwen/Qwen2.5-1.5B-Instruct"
 
 
74
 
75
  tokenizer = AutoTokenizer.from_pretrained(model_name)
 
76
  model = AutoModelForCausalLM.from_pretrained(
77
  model_name,
78
+ torch_dtype=torch.float32 # CPU only
79
  )
80
 
81
  generator = pipeline(
82
  "text-generation",
83
  model=model,
84
  tokenizer=tokenizer,
85
+ max_new_tokens=150,
86
  do_sample=True,
87
  temperature=0.6,
88
+ device=-1 # ensures CPU is used
89
  )
90
 
91
+ print("LLM loaded successfully!")
92
 
93
  # -----------------------------
94
  # Chat function
 
96
  def chat(user_input):
97
  context = retrieve_context(user_input)
98
 
 
 
 
 
99
  if not context:
100
  return "I don't know."
101
 
 
113
 
114
  Answer in short and clear sentences.
115
  """
116
+ response = generator(prompt, max_new_tokens=150, do_sample=True, temperature=0.6)
 
117
  text = response[0]["generated_text"]
118
 
119
  # Remove prompt if repeated
 
129
  fn=chat,
130
  inputs="text",
131
  outputs="text",
132
+ title="Livestock Chatbot (RAG + Qwen)",
133
+ description="This chatbot answers livestock questions using RAG retrieval and Qwen model generation."
134
  ).launch()