"""Gradio chat UI backed by a vLLM OpenAI-compatible endpoint (AMD MI300X demo).

Environment variables:
    VLLM_BASE_URL: base URL of the vLLM server (default: http://localhost:8000/v1).
    MODEL_NAME:    model identifier to request (default: meta-llama/Llama-3.1-8B-Instruct).
"""

import os

import gradio as gr
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()

VLLM_BASE_URL = os.environ.get("VLLM_BASE_URL", "http://localhost:8000/v1")
MODEL_NAME = os.environ.get("MODEL_NAME", "meta-llama/Llama-3.1-8B-Instruct")

# vLLM speaks the OpenAI wire protocol; it ignores the API key, but the
# client requires a non-empty value.
client = OpenAI(base_url=VLLM_BASE_URL, api_key="not-required")


def chat(message, history):
    """Stream an assistant reply for *message* given the chat *history*.

    Accepts both Gradio history formats: openai-style message dicts
    (``{"role": ..., "content": ...}``) and legacy ``(user, assistant)``
    pairs. Yields the progressively accumulated reply text, as
    ``gr.ChatInterface`` expects from a streaming callback.

    Args:
        message: The new user message.
        history: Prior conversation turns in either Gradio format.

    Yields:
        The assistant reply so far, growing with each streamed token.
    """
    messages = [{"role": "system", "content": "You are a helpful assistant."}]
    for item in history:
        if isinstance(item, dict):
            messages.append({"role": item["role"], "content": item["content"]})
        else:
            # Legacy tuple format: (user_message, assistant_reply_or_None).
            messages.append({"role": "user", "content": item[0]})
            if item[1]:
                messages.append({"role": "assistant", "content": item[1]})
    messages.append({"role": "user", "content": message})

    stream = client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        stream=True,
    )
    partial = ""
    for chunk in stream:
        # Some servers emit chunks with an empty `choices` list (e.g. a
        # trailing usage chunk) — indexing blindly would raise IndexError.
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta.content
        if delta:
            partial += delta
            yield partial


demo = gr.ChatInterface(
    fn=chat,
    title="AMD MI300X AI Demo",
    description="Chat with an LLM running on AMD MI300X GPU via vLLM.",
    examples=["Explain what AMD MI300X is.", "Write a Python hello world."],
    cache_examples=False,
)

if __name__ == "__main__":
    demo.launch()