use anyhow::Result; use serde_json::Value; /// Call the real vLLM endpoint on AMD MI300X via ngrok tunnel. /// Falls back to mock if the GPU is unreachable. pub async fn generate(redacted_prompt: &str) -> Result<(String, String, String)> { let vllm_url = std::env::var("VLLM_URL") .unwrap_or_else(|_| "http://localhost:8000/v1/completions".to_string()); let client = reqwest::Client::new(); let resp = client .post(&vllm_url) .header("ngrok-skip-browser-warning", "true") .json(&serde_json::json!({ "model": "Qwen/Qwen2.5-7B-Instruct", "prompt": redacted_prompt, "max_tokens": 250, "temperature": 0.7 })) .send() .await; match resp { Ok(r) if r.status().is_success() => { let json: Value = r.json().await?; let text = json["choices"][0]["text"] .as_str() .unwrap_or("No output") .trim() .to_string(); tracing::info!("vLLM inference completed on MI300X ({} chars)", text.len()); Ok((text, "7B (vLLM)".to_string(), "ROCm/MI300X".to_string())) } Ok(r) => { tracing::warn!("vLLM returned {} – falling back to mock", r.status()); Ok(( "Triage result: non‑urgent (mock – GPU unavailable)".to_string(), "mock".to_string(), "CPU (fallback)".to_string(), )) } Err(e) => { tracing::warn!("vLLM unreachable: {} – falling back to mock", e); Ok(( "Triage result: non‑urgent (mock – GPU unavailable)".to_string(), "mock".to_string(), "CPU (fallback)".to_string(), )) } } }