Yatsuiii committed on
Commit b050a20 · verified · 1 Parent(s): c15db7e

LLM: switch to HF InferenceClient (merged model, always-on)

Files changed (1)
  1. app.py +11 -41
app.py CHANGED
@@ -4,6 +4,7 @@ BrainConnect-ASD — Scanner-site-invariant ASD detection from fMRI.
 from __future__ import annotations
 
 import io
+import os
 from pathlib import Path
 
 import numpy as np
@@ -134,9 +135,11 @@ def preprocess(bold):
     bw = _windows(bold)
     return torch.FloatTensor(bw).unsqueeze(0), torch.FloatTensor(adj).unsqueeze(0)
 
-# ── LLM (Qwen2.5-7B LoRA fine-tuned on AMD MI300X) ────────────────────────
+# ── LLM (Qwen2.5-7B fine-tuned on AMD MI300X, served via HF Inference API) ─
+
+_LLM_MODEL = "Yatsuiii/asd-interpreter-merged"
+_HF_TOKEN = os.environ.get("HF_TOKEN", "")
 
-_LLM_MODEL = "Yatsuiii/asd-interpreter-lora"
 _SYSTEM_PROMPT = (
     "You are a clinical AI assistant specializing in functional MRI brain "
     "connectivity analysis for autism spectrum disorder (ASD) diagnosis support. "
@@ -151,21 +154,6 @@ _SYSTEM_PROMPT = (
     "input, do not mention it. (4) Always clarify findings are AI-assisted and require "
     "full clinical assessment. You do not make a diagnosis."
 )
-_llm_cache = None
-
-def get_llm():
-    global _llm_cache
-    if _llm_cache is not None:
-        return _llm_cache
-    from transformers import AutoModelForCausalLM, AutoTokenizer
-    tok = AutoTokenizer.from_pretrained(_LLM_MODEL)
-    tok.pad_token = tok.eos_token
-    mdl = AutoModelForCausalLM.from_pretrained(
-        _LLM_MODEL, torch_dtype=torch.bfloat16, device_map="auto"
-    )
-    mdl.eval()
-    _llm_cache = (mdl, tok)
-    return _llm_cache
 
 def _llm_report(p_mean: float, per_model: list, net_saliency: dict | None = None) -> str:
     consensus = sum(1 for _, p in per_model if p > 0.5)
@@ -203,20 +191,14 @@ def _llm_report(p_mean: float, per_model: list, net_saliency: dict | None = None
         f"and values listed above. Do not mention any network not in this report."
     )
     try:
-        mdl, tok = get_llm()
+        from huggingface_hub import InferenceClient
+        client = InferenceClient(model=_LLM_MODEL, token=_HF_TOKEN or None)
         messages = [
             {"role": "system", "content": _SYSTEM_PROMPT},
             {"role": "user", "content": user_msg},
         ]
-        text = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = tok(text, return_tensors="pt").to(next(mdl.parameters()).device)
-        with torch.no_grad():
-            out = mdl.generate(
-                **inputs, max_new_tokens=512, temperature=0.1,
-                do_sample=True, pad_token_id=tok.eos_token_id,
-            )
-        generated = out[0][inputs["input_ids"].shape[1]:]
-        return tok.decode(generated, skip_special_tokens=True).strip()
+        response = client.chat_completion(messages=messages, max_tokens=512, temperature=0.1)
+        return response.choices[0].message.content.strip()
     except Exception as e:
         return f"[LLM unavailable: {e}]"
 
@@ -699,20 +681,8 @@ AI-assisted screening only · Not a clinical diagnosis · Findings must be integ
 
     # LLM clinical interpretation (only attempt if GPU is available)
    import os
-    _has_gpu = torch.cuda.is_available() or (hasattr(torch, "hip") and torch.hip.is_available() if hasattr(torch, "hip") else False)
-    if _has_gpu:
-        llm_text = _llm_report(p_mean, per_model, net_saliency=net_saliency)
-        llm_block = f'<div style="color:#cbd5e1;font-size:0.85rem;line-height:1.7;white-space:pre-wrap">{llm_text}</div>'
-    else:
-        llm_block = """
-        <div style="color:#8b95a7;font-size:0.84rem;line-height:1.6">
-        Qwen2.5-7B LoRA interpreter is active — fine-tuned on AMD Instinct MI300X (192 GB HBM3, ROCm 7.0, bf16).
-        GPU inference is required to run it in real-time. The full model is available at
-        <span style="color:#fb923c">Yatsuiii/asd-interpreter-lora</span> on Hugging Face.
-        <br><br>
-        <span style="color:#5e6675">Clinical interpretation pipeline: GCN ensemble → per-network saliency extraction →
-        Qwen2.5-7B generates grounded clinical summary referencing only the actual saliency values.</span>
-        </div>"""
+    llm_text = _llm_report(p_mean, per_model, net_saliency=net_saliency)
+    llm_block = f'<div style="color:#cbd5e1;font-size:0.85rem;line-height:1.7;white-space:pre-wrap">{llm_text}</div>'
     report += f"""
 <div style="background:#0f1a1a;border:1px solid #1a3a3a;border-radius:8px;padding:18px 24px;margin-top:12px">
 <div style="display:flex;align-items:center;gap:10px;margin-bottom:10px">
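
For reference, the new code path reduces to a single huggingface_hub call. The sketch below gathers the pieces from the hunks above into one standalone snippet; it assumes a huggingface_hub version with chat-completion support is installed, that HF_TOKEN is available in the environment (e.g. as a Space secret), and that the merged model is reachable through the HF Inference API. The helper name generate_report and the placeholder prompt values are illustrative only and are not part of app.py; the constants mirror _LLM_MODEL and _HF_TOKEN from the diff.

import os

from huggingface_hub import InferenceClient

# Values taken from the diff above.
LLM_MODEL = "Yatsuiii/asd-interpreter-merged"
HF_TOKEN = os.environ.get("HF_TOKEN", "")


def generate_report(system_prompt: str, user_msg: str) -> str:
    # Mirrors the try/except in _llm_report: any API failure degrades to a placeholder
    # string instead of raising, so the surrounding report still renders.
    try:
        client = InferenceClient(model=LLM_MODEL, token=HF_TOKEN or None)
        response = client.chat_completion(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_msg},
            ],
            max_tokens=512,
            temperature=0.1,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        return f"[LLM unavailable: {e}]"


if __name__ == "__main__":
    # Placeholder inputs for illustration; app.py builds these from the GCN ensemble
    # probabilities and per-network saliency values.
    print(generate_report(
        "You are a clinical AI assistant. Findings are AI-assisted and not a diagnosis.",
        "Ensemble mean ASD probability: 0.62. Summarize the finding cautiously.",
    ))

With the GPU gate removed, _llm_report now runs on every request; a missing token or an unreachable Inference API endpoint falls through the same try/except and shows the "[LLM unavailable: ...]" placeholder rather than failing the report.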