use anyhow::Result;
use serde_json::Value;
/// Call the real vLLM endpoint on AMD MI300X via ngrok tunnel.
/// Falls back to mock if the GPU is unreachable.
///
/// Returns `(generated_text, model_label, hardware_label)`.
///
/// # Errors
/// Fails only if the HTTP client cannot be built or a successful response
/// carries a body that is not valid JSON. Network failures and non-2xx
/// statuses do NOT error — they return the mock-fallback tuple instead.
pub async fn generate(redacted_prompt: &str) -> Result<(String, String, String)> {
    // Single source of truth for the mock fallback; this tuple was
    // previously duplicated verbatim in two match arms.
    fn mock_fallback() -> (String, String, String) {
        (
            "Triage result: non‑urgent (mock – GPU unavailable)".to_string(),
            "mock".to_string(),
            "CPU (fallback)".to_string(),
        )
    }

    // Endpoint is overridable via env; default targets a local vLLM server.
    let vllm_url = std::env::var("VLLM_URL")
        .unwrap_or_else(|_| "http://localhost:8000/v1/completions".to_string());

    // Bound the request so a hung tunnel cannot stall the caller
    // indefinitely — without a timeout the "fallback" path can never
    // trigger on a silently dead connection.
    let client = reqwest::Client::builder()
        .timeout(std::time::Duration::from_secs(60))
        .build()?;

    let resp = client
        .post(&vllm_url)
        // Skip the ngrok free-tier browser interstitial page.
        .header("ngrok-skip-browser-warning", "true")
        .json(&serde_json::json!({
            "model": "Qwen/Qwen2.5-7B-Instruct",
            "prompt": redacted_prompt,
            "max_tokens": 250,
            "temperature": 0.7
        }))
        .send()
        .await;

    match resp {
        Ok(r) if r.status().is_success() => {
            // OpenAI-compatible completions schema: choices[0].text.
            let json: Value = r.json().await?;
            let text = json["choices"][0]["text"]
                .as_str()
                .unwrap_or("No output")
                .trim()
                .to_string();
            tracing::info!("vLLM inference completed on MI300X ({} chars)", text.len());
            Ok((text, "7B (vLLM)".to_string(), "ROCm/MI300X".to_string()))
        }
        Ok(r) => {
            tracing::warn!("vLLM returned {} – falling back to mock", r.status());
            Ok(mock_fallback())
        }
        Err(e) => {
            tracing::warn!("vLLM unreachable: {} – falling back to mock", e);
            Ok(mock_fallback())
        }
    }
}