Spaces:

lablab-ai-amd-developer-hackathon
/

rustvital-amd

Running

App Files Files Community

brainworm2024 committed 1 day ago

Commit

bece13e

1 Parent(s): f21e0aa

Connect to real AMD MI300X via vLLM

Browse files

Files changed (1) hide show

src/inference/qwen.rs +51 -11

src/inference/qwen.rs CHANGED Viewed

@@ -1,13 +1,53 @@
 use anyhow::Result;
-/// Mock inference for local testing / HF Space CPU.
-/// Returns (generated_text, model_used, device_info)
-pub async fn generate(_redacted_prompt: &str) -> Result<(String, String, String)> {
-    tracing::info!("[MOCK] Inference skipped – returning placeholder");
-    tokio::time::sleep(std::time::Duration::from_millis(10)).await;
-    Ok((
-        "Triage result: non‑urgent (mock)".to_string(),
-        "mock".to_string(),
-        "CPU (mock)".to_string(),
-    ))
-}

 use anyhow::Result;
+use serde_json::Value;
+/// Call the real vLLM endpoint on AMD MI300X.
+///
+/// Posts `redacted_prompt` to the completions endpoint named by the `VLLM_URL`
+/// environment variable (default `http://localhost:8000/v1/completions`),
+/// authenticating with the bearer token from `VLLM_API_KEY`.
+///
+/// Returns `(generated_text, model_used, device_info)`.
+///
+/// Falls back to a mock placeholder result (instead of returning an error)
+/// when the endpoint is unreachable, times out, answers with a non-success
+/// status, or sends a body that cannot be parsed as JSON.
+pub async fn generate(redacted_prompt: &str) -> Result<(String, String, String)> {
+    // Bound the whole request so a hung endpoint degrades to the mock result
+    // instead of stalling the caller forever.
+    const REQUEST_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(60);
+
+    let vllm_url = std::env::var("VLLM_URL")
+        .unwrap_or_else(|_| "http://localhost:8000/v1/completions".to_string());
+    // NOTE(review): "abc-123" looks like a placeholder token; real deployments
+    // should always set VLLM_API_KEY rather than rely on this default.
+    let api_key = std::env::var("VLLM_API_KEY").unwrap_or_else(|_| "abc-123".to_string());
+
+    let payload = serde_json::json!({
+        "model": "Qwen/Qwen2.5-7B-Instruct",
+        "prompt": redacted_prompt,
+        "max_tokens": 250,
+        "temperature": 0.7
+    });
+
+    let sent = reqwest::Client::new()
+        .post(&vllm_url)
+        .bearer_auth(&api_key)
+        .timeout(REQUEST_TIMEOUT)
+        .json(&payload)
+        .send()
+        .await;
+
+    let response = match sent {
+        Ok(r) if r.status().is_success() => r,
+        Ok(r) => {
+            tracing::warn!("vLLM returned {} – falling back to mock", r.status());
+            return Ok(mock_fallback());
+        }
+        Err(e) => {
+            tracing::warn!("vLLM unreachable: {} – falling back to mock", e);
+            return Ok(mock_fallback());
+        }
+    };
+
+    // A 2xx answer with an unreadable body is treated like any other outage:
+    // degrade to the mock instead of propagating the error to the caller.
+    let json: Value = match response.json().await {
+        Ok(v) => v,
+        Err(e) => {
+            tracing::warn!("vLLM sent an unreadable body: {} – falling back to mock", e);
+            return Ok(mock_fallback());
+        }
+    };
+
+    let text = match json["choices"][0]["text"].as_str() {
+        Some(t) => t.trim().to_string(),
+        None => {
+            // Keep the established placeholder, but make the anomaly visible.
+            tracing::warn!("vLLM response contained no choices[0].text");
+            "No output".to_string()
+        }
+    };
+    tracing::info!("vLLM inference completed on MI300X ({} chars)", text.len());
+    Ok((text, "7B (vLLM)".to_string(), "ROCm/MI300X".to_string()))
+}
+
+/// Placeholder `(generated_text, model_used, device_info)` returned whenever
+/// the vLLM endpoint cannot produce a usable answer.
+fn mock_fallback() -> (String, String, String) {
+    (
+        "Triage result: non‑urgent (mock – GPU unavailable)".to_string(),
+        "mock".to_string(),
+        "CPU (fallback)".to_string(),
+    )
+}