use anyhow::Result;
use serde_json::Value;

/// Call the real vLLM endpoint on AMD MI300X via ngrok tunnel.
/// Falls back to mock if the GPU is unreachable.
pub async fn generate(redacted_prompt: &str) -> Result<(String, String, String)> {
    let vllm_url = std::env::var("VLLM_URL")
        .unwrap_or_else(|_| "http://localhost:8000/v1/completions".to_string());

    let client = reqwest::Client::new();
    let resp = client
        .post(&vllm_url)
        .header("ngrok-skip-browser-warning", "true")
        .json(&serde_json::json!({
            "model": "Qwen/Qwen2.5-7B-Instruct",
            "prompt": redacted_prompt,
            "max_tokens": 250,
            "temperature": 0.7
        }))
        .send()
        .await;

    match resp {
        Ok(r) if r.status().is_success() => {
            let json: Value = r.json().await?;
            let text = json["choices"][0]["text"]
                .as_str()
                .unwrap_or("No output")
                .trim()
                .to_string();
            tracing::info!("vLLM inference completed on MI300X ({} chars)", text.len());
            Ok((text, "7B (vLLM)".to_string(), "ROCm/MI300X".to_string()))
        }
        Ok(r) => {
            tracing::warn!("vLLM returned {} – falling back to mock", r.status());
            Ok((
                "Triage result: non‑urgent (mock – GPU unavailable)".to_string(),
                "mock".to_string(),
                "CPU (fallback)".to_string(),
            ))
        }
        Err(e) => {
            tracing::warn!("vLLM unreachable: {} – falling back to mock", e);
            Ok((
                "Triage result: non‑urgent (mock – GPU unavailable)".to_string(),
                "mock".to_string(),
                "CPU (fallback)".to_string(),
            ))
        }
    }
}