| use anyhow::Result; |
| use serde_json::Value; |
|
|
| |
| |
| pub async fn generate(redacted_prompt: &str) -> Result<(String, String, String)> { |
| let vllm_url = std::env::var("VLLM_URL") |
| .unwrap_or_else(|_| "http://localhost:8000/v1/completions".to_string()); |
|
|
| let client = reqwest::Client::new(); |
| let resp = client |
| .post(&vllm_url) |
| .header("ngrok-skip-browser-warning", "true") |
| .json(&serde_json::json!({ |
| "model": "Qwen/Qwen2.5-7B-Instruct", |
| "prompt": redacted_prompt, |
| "max_tokens": 250, |
| "temperature": 0.7 |
| })) |
| .send() |
| .await; |
|
|
| match resp { |
| Ok(r) if r.status().is_success() => { |
| let json: Value = r.json().await?; |
| let text = json["choices"][0]["text"] |
| .as_str() |
| .unwrap_or("No output") |
| .trim() |
| .to_string(); |
| tracing::info!("vLLM inference completed on MI300X ({} chars)", text.len()); |
| Ok((text, "7B (vLLM)".to_string(), "ROCm/MI300X".to_string())) |
| } |
| Ok(r) => { |
| tracing::warn!("vLLM returned {} – falling back to mock", r.status()); |
| Ok(( |
| "Triage result: non‑urgent (mock – GPU unavailable)".to_string(), |
| "mock".to_string(), |
| "CPU (fallback)".to_string(), |
| )) |
| } |
| Err(e) => { |
| tracing::warn!("vLLM unreachable: {} – falling back to mock", e); |
| Ok(( |
| "Triage result: non‑urgent (mock – GPU unavailable)".to_string(), |
| "mock".to_string(), |
| "CPU (fallback)".to_string(), |
| )) |
| } |
| } |
| } |
|
|