// Commit 4e751f3 ("Remove all auth to fix GPU connection", brainworm2024).
use anyhow::Result;
use serde_json::Value;
/// Call the real vLLM endpoint on AMD MI300X via ngrok tunnel.
/// Falls back to mock if the GPU is unreachable.
///
/// The endpoint is read from the `VLLM_URL` env var, defaulting to a local
/// vLLM server. Returns `(generated_text, model_label, backend_label)`.
///
/// # Errors
/// Returns an error only if the HTTP client cannot be built or a successful
/// response carries a malformed JSON body; connection failures, timeouts,
/// and non-2xx statuses degrade to the mock result instead of erroring.
pub async fn generate(redacted_prompt: &str) -> Result<(String, String, String)> {
    let vllm_url = std::env::var("VLLM_URL")
        .unwrap_or_else(|_| "http://localhost:8000/v1/completions".to_string());

    // Bound the request: without a timeout, a routable-but-hung GPU host
    // would block this call indefinitely instead of triggering the fallback.
    let client = reqwest::Client::builder()
        .timeout(std::time::Duration::from_secs(30))
        .build()?;

    let resp = client
        .post(&vllm_url)
        // Skip the ngrok free-tier browser interstitial for API requests.
        .header("ngrok-skip-browser-warning", "true")
        .json(&serde_json::json!({
            "model": "Qwen/Qwen2.5-7B-Instruct",
            "prompt": redacted_prompt,
            "max_tokens": 250,
            "temperature": 0.7
        }))
        .send()
        .await;

    match resp {
        Ok(r) if r.status().is_success() => {
            let json: Value = r.json().await?;
            // OpenAI-compatible completions shape: choices[0].text holds the output.
            let text = json["choices"][0]["text"]
                .as_str()
                .unwrap_or("No output")
                .trim()
                .to_string();
            tracing::info!("vLLM inference completed on MI300X ({} chars)", text.len());
            Ok((text, "7B (vLLM)".to_string(), "ROCm/MI300X".to_string()))
        }
        Ok(r) => {
            tracing::warn!("vLLM returned {} – falling back to mock", r.status());
            Ok(mock_fallback())
        }
        Err(e) => {
            tracing::warn!("vLLM unreachable: {} – falling back to mock", e);
            Ok(mock_fallback())
        }
    }
}

/// Mock triage tuple returned whenever the real GPU endpoint is unavailable.
fn mock_fallback() -> (String, String, String) {
    (
        "Triage result: non‑urgent (mock – GPU unavailable)".to_string(),
        "mock".to_string(),
        "CPU (fallback)".to_string(),
    )
}