| import os |
| import gradio as gr |
| from openai import OpenAI |
|
|
# Connection settings for the vLLM OpenAI-compatible endpoint; both may be
# overridden through environment variables at deploy time.
VLLM_BASE_URL = os.getenv("VLLM_BASE_URL", "http://129.212.178.215:8000/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-1.5B-Instruct")

# vLLM does not check API keys, but the OpenAI client insists on a non-empty
# value, hence the placeholder string.
client = OpenAI(base_url=VLLM_BASE_URL, api_key="not-required")
|
|
|
|
def chat(message, history):
    """Stream a chat completion for *message*, given the prior *history*.

    Args:
        message: The user's latest message text.
        history: Prior turns, either as a list of ``{"role", "content"}``
            dicts (Gradio "messages" format) or as ``(user, assistant)``
            pairs (legacy tuple format).

    Yields:
        The assistant reply accumulated so far, so Gradio renders the
        response incrementally as tokens arrive.
    """
    messages = [{"role": "system", "content": "You are a helpful assistant."}]
    # Normalize both Gradio history formats into OpenAI-style messages.
    for item in history:
        if isinstance(item, dict):
            messages.append({"role": item["role"], "content": item["content"]})
        else:
            # Legacy tuple format: (user_message, assistant_reply_or_None).
            messages.append({"role": "user", "content": item[0]})
            if item[1]:
                messages.append({"role": "assistant", "content": item[1]})
    messages.append({"role": "user", "content": message})

    stream = client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        stream=True,
    )

    partial = ""
    for chunk in stream:
        # Some servers emit stream chunks with an empty `choices` list
        # (e.g. a trailing usage chunk); skip them instead of letting
        # `choices[0]` raise IndexError mid-stream.
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta.content
        if delta:
            partial += delta
            yield partial
|
|
|
|
# Build the web UI: a chat front-end wired to the streaming handler above.
demo = gr.ChatInterface(
    chat,
    title="AMD MI300X AI Demo",
    description="Chat with an LLM running on AMD MI300X GPU via vLLM.",
    examples=[
        "Explain what AMD MI300X is.",
        "Write a Python hello world.",
    ],
)
|
|
if __name__ == "__main__":
    # Bind to all interfaces so the demo is reachable from outside the host
    # (e.g. when running inside a container or on a remote GPU box).
    demo.launch(server_name="0.0.0.0", server_port=7860)
|
|