Remove video stream: not good enough because the GPU is allocated per function call
74f14ed
import spaces
import gradio as gr
import cv2
import numpy as np
from ultralytics import YOLO
from PIL import Image
# Load YOLO once at startup
# yolov8n.pt = nano model, fastest, best for CPU
# Downloads automatically on first run (~6MB)
print("Loading YOLO...")
model = YOLO("yolov8n.pt")
model.to("cpu") # keep on CPU at load time
print("YOLO ready")
# Colour map in BGR format (OpenCV uses Blue-Green-Red)
COLOURS = {
"person": (0, 200, 255),
"chair": (255, 100, 0),
"bottle": (0, 255, 100),
"laptop": (255, 200, 0),
"car": (0, 100, 255),
"dog": (180, 0, 255),
"cat": (255, 0, 180),
"cup": (0, 255, 220),
"book": (100, 255, 0),
"phone": (255, 255, 0),
"backpack":(255, 140, 0),
"umbrella":(200, 0, 255),
}
DEFAULT_COLOUR = (200, 200, 200)
# Known real-world heights in cm for distance estimation
KNOWN_HEIGHTS_CM = {
"person": 170, "door": 200, "chair": 90,
"car": 150, "bottle": 25, "cup": 10,
"laptop": 25, "stop sign": 75,
}
FOCAL_LENGTH_PX = 700 # approximate for standard webcam
def estimate_distance(cls_name, bbox_height_px):
"""
Estimates distance using pinhole camera formula.
distance = (real_height_cm Γ— focal_length) / pixel_height
Example: person is 170cm tall in real life.
If they appear 340px tall in frame β†’ distance = (170Γ—700)/340 = 350cm = 3.5m
"""
real_h = KNOWN_HEIGHTS_CM.get(cls_name)
if not real_h or bbox_height_px < 5:
return None
dist_cm = (real_h * FOCAL_LENGTH_PX) / bbox_height_px
dist_m = dist_cm / 100
if dist_m < 0.8:
return "within arm's reach"
elif dist_m < 1.5:
        return f"{dist_m:.1f}m - very close"
elif dist_m < 4.0:
return f"{dist_m:.1f}m ahead"
else:
return f"{dist_m:.0f}m away"
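
# Optional calibration sketch (hypothetical helper, not called anywhere in
# this demo): FOCAL_LENGTH_PX above is only an approximation, so accuracy
# improves if you solve the pinhole formula for focal length using a single
# measured reference shot, e.g. a person of known height at a known distance.
def calibrate_focal_length_px(bbox_height_px, distance_cm, real_height_cm):
    """Rearranged pinhole formula: focal = (pixel_height * distance) / real_height."""
    return (bbox_height_px * distance_cm) / real_height_cm
# Example: a 170cm person standing 200cm away and appearing 595px tall
# gives (595 * 200) / 170 = 700px, matching FOCAL_LENGTH_PX above.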
def draw_trapezoid(frame, x1, y1, x2, y2, colour):
"""
Draws a perspective trapezoid from the object down to the
bottom centre of the frame β€” showing the path to the object.
Wide at bottom (user position) β†’ narrow at top (object position).
The further the object, the narrower the top naturally is
because YOLO gives a smaller box for distant objects.
This gives an instant visual sense of depth and direction.
"""
h, w = frame.shape[:2]
# Bottom centre of frame = where the user is standing
bottom_left = (w // 2 - 40, h - 10)
bottom_right = (w // 2 + 40, h - 10)
# Top of trapezoid = bottom of the object's bounding box
top_left = (x1, y2)
top_right = (x2, y2)
# Draw the two converging side lines
# These are the "path lines" from user to object
cv2.line(frame, bottom_left, top_left, colour, 1, cv2.LINE_AA)
cv2.line(frame, bottom_right, top_right, colour, 1, cv2.LINE_AA)
# Draw a semi-transparent filled trapezoid to show the path area
pts = np.array([bottom_left, bottom_right,
top_right, top_left], dtype=np.int32)
overlay = frame.copy()
cv2.fillPoly(overlay, [pts], colour)
cv2.addWeighted(overlay, 0.08, frame, 0.92, 0, frame)
return frame
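
# Note: on Hugging Face ZeroGPU Spaces, @spaces.GPU requests a GPU slice for
# each decorated call and releases it when the call returns. This per-call
# allocation overhead is why the streaming webcam tab further down is
# commented out (see the commit message).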
@spaces.GPU
def analyse(image):
if image is None:
return None, "No image provided."
# Move model to GPU for this request
model.to("cuda")
    # Downscale large images first to keep inference fast
max_width = 640
w, h = image.size
if w > max_width:
ratio = max_width / w
image = image.resize((max_width, int(h * ratio)), Image.LANCZOS)
    # Convert PIL (RGB) -> OpenCV (BGR)
    # PIL stores Red-Green-Blue; OpenCV expects Blue-Green-Red,
    # so cv2.cvtColor swaps the channel order for us.
frame = np.array(image)
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
    # Run YOLO - returns a list of results
results = model(frame, verbose=False, conf=0.45)
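    # conf=0.45 keeps only boxes the model scores at 45%+ confidence;
    # lowering it detects more objects at the cost of false positives.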
annotated = frame.copy()
detections = []
spoken_parts = []
if results[0].boxes is not None:
for box in results[0].boxes:
# Extract box coordinates
x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
cls_name = model.names[int(box.cls[0])]
confidence = float(box.conf[0])
colour = COLOURS.get(cls_name, DEFAULT_COLOUR)
# ── Distance estimation ────────────────────────────
bbox_h = y2 - y1
distance = estimate_distance(cls_name, bbox_h)
# ── Draw perspective trapezoid first (behind box) ──
annotated = draw_trapezoid(annotated, x1, y1, x2, y2, colour)
# ── Semi-transparent box fill ──────────────────────
overlay = annotated.copy()
cv2.rectangle(overlay, (x1,y1), (x2,y2), colour, -1)
cv2.addWeighted(overlay, 0.12, annotated, 0.88, 0, annotated)
# ── Box border ─────────────────────────────────────
cv2.rectangle(annotated, (x1,y1), (x2,y2), colour, 2)
# ── CCTV corner brackets ───────────────────────────
            L, T = 16, 3  # bracket arm length (px) and line thickness
cv2.line(annotated,(x1,y1),(x1+L,y1),colour,T)
cv2.line(annotated,(x1,y1),(x1,y1+L),colour,T)
cv2.line(annotated,(x2,y1),(x2-L,y1),colour,T)
cv2.line(annotated,(x2,y1),(x2,y1+L),colour,T)
cv2.line(annotated,(x1,y2),(x1+L,y2),colour,T)
cv2.line(annotated,(x1,y2),(x1,y2-L),colour,T)
cv2.line(annotated,(x2,y2),(x2-L,y2),colour,T)
cv2.line(annotated,(x2,y2),(x2,y2-L),colour,T)
# ── Top label: CLASS | CONFIDENCE ─────────────────
label = f"{cls_name.upper()} {confidence*100:.0f}%"
font = cv2.FONT_HERSHEY_SIMPLEX
(tw, th), _ = cv2.getTextSize(label, font, 0.55, 1)
cv2.rectangle(annotated,(x1,y1-th-10),(x1+tw+8,y1),colour,-1)
cv2.putText(annotated, label,(x1+4,y1-6),
font, 0.55, (0,0,0), 1, cv2.LINE_AA)
# ── Distance label below box ───────────────────────
if distance:
cv2.putText(annotated, distance,
(x1+4, y2+18),
font, 0.45, colour, 1, cv2.LINE_AA)
# ── Build detection record ─────────────────────────
            det_text = f"-> {cls_name} ({confidence*100:.0f}%)"
if distance:
det_text += f" | {distance}"
detections.append(det_text)
# ── Build spoken description part ──────────────────
spoken = cls_name
if distance:
spoken += f", {distance}"
spoken_parts.append(spoken)
# ── Convert back to RGB for Gradio ────────────────────────
rgb = cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB)
# ── Build final output text ────────────────────────────────
if spoken_parts:
spoken_sentence = "Navigator sees: " + ". ".join(spoken_parts)
summary = "\n".join(detections)
else:
spoken_sentence = "Nothing clearly detected in this scene."
summary = "No objects detected."
    output_text = f"Navigator would say:\n\"{spoken_sentence}\"\n\nDetections:\n{summary}"
return Image.fromarray(rgb), output_text
# ─────────────────────────────────────────────────────────────
# GRADIO UI
# ─────────────────────────────────────────────────────────────
with gr.Blocks(title="Navigator - AI Sight Assistant") as demo:
gr.Markdown("""
# Navigator - AI Sight Assistant
**Real-time scene description for visually impaired users**
> *"Empowering blind users to hear their surroundings through AI"*
Built for the **AMD Developer Hackathon** · Track 3: Vision & Multimodal AI
Powered by **YOLOv8 · LLaVA · FER · LiteRun optimizer · AMD MI300X + ROCm**
""")
gr.Markdown("---")
# ── How it works status table ──────────────────────────────
gr.Markdown("""
### Live Demo Status
| Module | Status | What it does |
|--------|--------|--------------|
| YOLOv8 object detection | Live | Finds objects in ~50ms |
| Distance estimation | Live | Estimates metres to each object |
| Perspective path lines | Live | Draws converging lines to show depth |
| FER emotion detection | Integrating | Reads facial expressions |
| LLaVA scene description | Integrating | Deep natural language descriptions |
| Text-to-speech output | Integrating | Speaks result into earpiece |
| LiteRun memory optimizer | AMD cloud | Fits 16GB model into 6GB VRAM |
""")
gr.Markdown("---")
gr.Markdown("### Try it β€” use your webcam or upload an image")
with gr.Tabs():
# with gr.Tab("Live Webcam"):
# with gr.Row():
# with gr.Column(scale=1):
# webcam_inp = gr.Image(
# sources=["webcam"],
# type="pil",
# label="Webcam",
# streaming=True
# )
# with gr.Column(scale=1):
# webcam_out_img = gr.Image(label="Annotated Feed")
# webcam_out_text = gr.Textbox(
# label="Navigator Output",
# lines=6
# )
        # # Auto-analyse a webcam frame every 1.5 seconds (stream_every=1.5)
# webcam_inp.stream(
# fn=analyse,
# inputs=webcam_inp,
# outputs=[webcam_out_img, webcam_out_text],
# time_limit=60,
# stream_every=1.5
# )
with gr.Tab("Upload Image"):
with gr.Row():
with gr.Column(scale=1):
upload_inp = gr.Image(
sources=["upload"],
type="pil",
label="Upload Image"
)
btn = gr.Button("Analyse", variant="primary", size="lg")
with gr.Column(scale=1):
upload_out_img = gr.Image(label="Annotated Feed")
upload_out_text = gr.Textbox(
label="Navigator Output",
lines=6
)
btn.click(fn=analyse, inputs=upload_inp, outputs=[upload_out_img, upload_out_text])
gr.Markdown("---")
gr.Markdown("""
### How Navigator works
1. Your camera captures a frame
2. YOLO detects every object (~50ms, the fast path)
3. Distance is calculated per object using the pinhole camera model
4. A perspective trapezoid is drawn showing the path to each object
5. FER reads the emotion of each detected person (coming soon)
6. LLaVA describes the full scene in natural language (coming soon)
7. TTS speaks the result into the blind user's earpiece (coming soon)
### ⚡ LiteRun: Our Memory Optimizer
LLaVA normally needs **16GB of VRAM** to run.
LiteRun squeezes it into **6GB** using quantization, memory routing, and KV-cache compression.
**Before LiteRun:** crashes with out-of-memory errors
**After LiteRun:** runs smoothly on AMD hardware
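
A minimal sketch of the 4-bit quantization step LiteRun builds on (the model id and the transformers + bitsandbytes setup here are illustrative assumptions; LiteRun's memory routing and KV-cache compression are not shown):

```python
import torch
from transformers import BitsAndBytesConfig, LlavaForConditionalGeneration

# Store weights in 4-bit NF4 and compute in fp16, cutting LLaVA's
# weight memory to roughly a quarter of the fp16 footprint.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
model = LlavaForConditionalGeneration.from_pretrained(
    "llava-hf/llava-1.5-7b-hf",  # illustrative checkpoint
    quantization_config=bnb_config,
    device_map="auto",
)
```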
*Full real-time webcam demo with audio coming Day 7* 🔥
""")
demo.launch()