| import os | |
| import gradio as gr | |
| from openai import OpenAI | |
| from dotenv import load_dotenv | |
# Load variables from a local .env file (if present) before reading configuration.
load_dotenv()

# Base URL of the OpenAI-compatible endpoint and the model to request from it;
# both can be overridden through the environment.
VLLM_BASE_URL = os.environ.get("VLLM_BASE_URL", "http://localhost:8000/v1")
MODEL_NAME = os.environ.get("MODEL_NAME", "meta-llama/Llama-3.1-8B-Instruct")

# The OpenAI client requires some api_key value. Read it from VLLM_API_KEY so a
# server that enforces a key can be used; when the variable is unset or empty,
# fall back to the original placeholder.
client = OpenAI(
    base_url=VLLM_BASE_URL,
    api_key=os.environ.get("VLLM_API_KEY") or "not-required",
)
def chat(message: str, history: list[dict]):
    """Stream an assistant reply to ``message`` from the configured endpoint.

    Args:
        message: The latest user message.
        history: Earlier turns of the conversation; each item is read via its
            ``"role"`` and ``"content"`` keys.

    Yields:
        The reply text accumulated so far, once for every non-empty content
        delta received from the stream.
    """
    messages = [{"role": "system", "content": "You are a helpful assistant."}]
    # Copy only role/content from each history item so nothing else is sent.
    messages.extend(
        {"role": item["role"], "content": item["content"]} for item in history
    )
    messages.append({"role": "user", "content": message})

    stream = client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        stream=True,
    )
    partial = ""
    try:
        for chunk in stream:
            # A chunk may arrive with an empty ``choices`` list (for example a
            # usage-only chunk); skip it instead of raising IndexError.
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta.content
            if delta:
                partial += delta
                yield partial
    finally:
        # Runs on normal completion, on errors, and when the consumer stops
        # iterating early, so the underlying HTTP response is always released.
        stream.close()
# Chat UI definition: `chat` is the response callback and `type="messages"`
# selects role/content dictionaries for the conversation history.
demo = gr.ChatInterface(
    # Behaviour.
    fn=chat,
    type="messages",
    cache_examples=False,
    # Presentation.
    title="AMD MI300X AI Demo",
    description="Chat with an LLM running on AMD MI300X GPU via vLLM.",
    examples=[
        "Explain what AMD MI300X is.",
        "Write a Python hello world.",
    ],
)


# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()