import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import threading
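# Gradio chat app serving a quantized Llama 2 GGUF model locally via
# llama-cpp-python: the weights are fetched from the Hugging Face Hub and
# loaded in a background thread so the interface can start right away.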

TITLE = "AI Copilot for Patients"
DESCRIPTION = "I answer health-related questions and concerns."

# Globals shared between the loader thread and the chat handler.
llm_llama_cpp = None
model_ready = False


def load_model():
    """Download the GGUF weights and initialize the llama.cpp model."""
    global llm_llama_cpp, model_ready
    try:
        print("Downloading model...")
        model_file_path = hf_hub_download(
            repo_id="TheBloke/Llama-2-7B-GGUF",
            filename="llama-2-7b.Q4_0.gguf",
        )

        print("Initializing model...")
        llm_llama_cpp = Llama(
            model_path=model_file_path,
            verbose=False,
            n_ctx=4096,  # context window size in tokens
        )
        model_ready = True
        print("Model is ready.")
    except Exception as e:
        print(f"Failed to load model: {e}")
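
# hf_hub_download caches the weights locally (under ~/.cache/huggingface by
# default), so subsequent launches skip the download step.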

# Load in the background so the UI is not blocked; daemon=True lets the
# process exit even if the model is still loading.
threading.Thread(target=load_model, daemon=True).start()


def talk(prompt, history):
    # Generator handler: Gradio streams each yielded string to the chatbot.
    if not model_ready:
        yield "⏳ Please wait, the model is still loading..."
        return

    try:
        response = ""
        response_stream = llm_llama_cpp.create_completion(
            prompt=prompt,
            max_tokens=200,
            stream=True,
        )

        # Accumulate streamed tokens and yield the partial text so the
        # chat window updates while the model generates.
        for chunk in response_stream:
            response += chunk["choices"][0].get("text", "")
            yield response

    except Exception as e:
        print(f"Error in generating response: {e}")
        yield f"Error during response generation: {e}"
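
# Note: TheBloke/Llama-2-7B-GGUF is the base Llama 2 model, so the raw user
# message works as a plain completion prompt. A chat-tuned GGUF (e.g.
# Llama-2-7B-Chat) would expect its own instruction template instead.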


demo = gr.ChatInterface(
    fn=talk,
    chatbot=gr.Chatbot(
        show_label=True,
        show_share_button=True,
        show_copy_button=True,
        layout="bubble",
        type="messages",
    ),
    theme=gr.themes.Soft(),  # built-in theme object; the string form is lowercase "soft"
    examples=["What is diabetes?"],
    title=TITLE,
    description=DESCRIPTION,
)
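
# With type="messages", `history` arrives as a list of {"role": ..., "content": ...}
# dicts; this simple handler ignores it and answers each prompt independently.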

# share=True additionally exposes the app via a temporary public Gradio link.
demo.launch(share=True)