Yatsuiii committed on
Commit e83206b · verified · 1 Parent(s): d84066a

LLM: vLLM on AMD MI300X (OpenAI-compat) with HF fallback
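For context: the OpenAI-compatible path added here assumes a vLLM server is already serving the merged model on the MI300X host, and that its base URL (including the /v1 suffix) is exposed to the Space as VLLM_BASE_URL. The sketch below is a connectivity check under those assumptions; the endpoint URL and the launch command are placeholders, only the environment variable name and model id come from this commit.

# Sketch only. Assumed launch of vLLM's OpenAI-compatible server on the MI300X host:
#   vllm serve Yatsuiii/asd-interpreter-merged --port 8000
import os
from openai import OpenAI

base_url = os.environ.get("VLLM_BASE_URL", "http://mi300x-host:8000/v1")  # placeholder default
client = OpenAI(base_url=base_url, api_key="not-required")  # same placeholder key the app passes
print([m.id for m in client.models.list().data])  # smoke check: lists the model(s) being served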

Files changed (1)
  1. app.py  +23 -5
app.py CHANGED
@@ -135,9 +135,10 @@ def preprocess(bold):
     bw = _windows(bold)
     return torch.FloatTensor(bw).unsqueeze(0), torch.FloatTensor(adj).unsqueeze(0)
 
-# ── LLM (Qwen2.5-7B fine-tuned on AMD MI300X, served via HF Inference API) ─
+# ── LLM (Qwen2.5-7B fine-tuned on AMD MI300X, served via vLLM on MI300X) ───
 
-_LLM_MODEL = "Yatsuiii/asd-interpreter-merged"
+_VLLM_URL = os.environ.get("VLLM_BASE_URL", "")
+_LLM_MODEL = "Yatsuiii/asd-interpreter-merged"
 _HF_TOKEN = os.environ.get("HF_TOKEN", "")
 
 _SYSTEM_PROMPT = (
@@ -191,13 +192,30 @@ def _llm_report(p_mean: float, per_model: list, net_saliency: dict | None = None
         f"and values listed above. Do not mention any network not in this report."
     )
     try:
-        from huggingface_hub import InferenceClient
-        client = InferenceClient(model=_LLM_MODEL, token=_HF_TOKEN or None)
+        from openai import OpenAI
+        if _VLLM_URL:
+            # Live AMD MI300X inference via vLLM
+            client = OpenAI(base_url=_VLLM_URL, api_key="not-required")
+            model_id = _LLM_MODEL
+        else:
+            # Fallback: HF Inference API
+            from huggingface_hub import InferenceClient as _HFClient
+            client = _HFClient(model=_LLM_MODEL, token=_HF_TOKEN or None)
+            response = client.chat_completion(
+                messages=[
+                    {"role": "system", "content": _SYSTEM_PROMPT},
+                    {"role": "user", "content": user_msg},
+                ],
+                max_tokens=512, temperature=0.1,
+            )
+            return response.choices[0].message.content.strip()
         messages = [
             {"role": "system", "content": _SYSTEM_PROMPT},
             {"role": "user", "content": user_msg},
         ]
-        response = client.chat_completion(messages=messages, max_tokens=512, temperature=0.1)
+        response = client.chat.completions.create(
+            model=model_id, messages=messages, max_tokens=512, temperature=0.1
+        )
         return response.choices[0].message.content.strip()
     except Exception as e:
         return f"[LLM unavailable: {e}]"