// Commit 4e751f3 ("Remove all auth to fix GPU connection", brainworm2024).
use anyhow::Result;
use serde_json::Value;
/// Call the real vLLM endpoint on AMD MI300X via ngrok tunnel.
/// Falls back to mock if the GPU is unreachable.
///
/// The endpoint is read from the `VLLM_URL` env var, defaulting to a local
/// vLLM server. Returns `(generated_text, model_label, backend_label)`.
///
/// # Errors
/// Returns an error only if the HTTP client cannot be built or a successful
/// response carries a malformed JSON body; connection failures, timeouts,
/// and non-2xx statuses degrade to the mock result instead of erroring.
pub async fn generate(redacted_prompt: &str) -> Result<(String, String, String)> {
    let vllm_url = std::env::var("VLLM_URL")
        .unwrap_or_else(|_| "http://localhost:8000/v1/completions".to_string());

    // Bound the request: without a timeout, a routable-but-hung GPU host
    // would block this call indefinitely instead of triggering the fallback.
    let client = reqwest::Client::builder()
        .timeout(std::time::Duration::from_secs(30))
        .build()?;

    let resp = client
        .post(&vllm_url)
        // Skip the ngrok free-tier browser interstitial for API requests.
        .header("ngrok-skip-browser-warning", "true")
        .json(&serde_json::json!({
            "model": "Qwen/Qwen2.5-7B-Instruct",
            "prompt": redacted_prompt,
            "max_tokens": 250,
            "temperature": 0.7
        }))
        .send()
        .await;

    match resp {
        Ok(r) if r.status().is_success() => {
            let json: Value = r.json().await?;
            // OpenAI-compatible completions shape: choices[0].text holds the output.
            let text = json["choices"][0]["text"]
                .as_str()
                .unwrap_or("No output")
                .trim()
                .to_string();
            tracing::info!("vLLM inference completed on MI300X ({} chars)", text.len());
            Ok((text, "7B (vLLM)".to_string(), "ROCm/MI300X".to_string()))
        }
        Ok(r) => {
            tracing::warn!("vLLM returned {} – falling back to mock", r.status());
            Ok(mock_fallback())
        }
        Err(e) => {
            tracing::warn!("vLLM unreachable: {} – falling back to mock", e);
            Ok(mock_fallback())
        }
    }
}

/// Mock triage tuple returned whenever the real GPU endpoint is unavailable.
fn mock_fallback() -> (String, String, String) {
    (
        "Triage result: non‑urgent (mock – GPU unavailable)".to_string(),
        "mock".to_string(),
        "CPU (fallback)".to_string(),
    )
}