import spaces
import gradio as gr
import cv2
import numpy as np
from ultralytics import YOLO
from PIL import Image

# Load YOLO once at startup.
# yolov8n.pt = nano model, fastest, best for CPU.
# Downloads automatically on first run (~6MB).
print("Loading YOLO...")
model = YOLO("yolov8n.pt")
model.to("cpu")  # keep on CPU at load time
print("YOLO ready")

# Colour map — BGR format (OpenCV uses Blue-Green-Red)
COLOURS = {
    "person":     (0, 200, 255),
    "chair":      (255, 100, 0),
    "bottle":     (0, 255, 100),
    "laptop":     (255, 200, 0),
    "car":        (0, 100, 255),
    "dog":        (180, 0, 255),
    "cat":        (255, 0, 180),
    "cup":        (0, 255, 220),
    "book":       (100, 255, 0),
    "cell phone": (255, 255, 0),  # COCO names this class "cell phone", not "phone"
    "backpack":   (255, 140, 0),
    "umbrella":   (200, 0, 255),
}
DEFAULT_COLOUR = (200, 200, 200)

# Known real-world heights in cm for distance estimation
KNOWN_HEIGHTS_CM = {
    "person": 170,
    "door": 200,
    "chair": 90,
    "car": 150,
    "bottle": 25,
    "cup": 10,
    "laptop": 25,
    "stop sign": 75,
}

FOCAL_LENGTH_PX = 700  # approximate focal length for a standard webcam


def estimate_distance(cls_name, bbox_height_px):
    """
    Estimate distance using the pinhole camera formula:

        distance = (real_height_cm × focal_length) / pixel_height

    Example: a person is ~170cm tall in real life. If they appear 340px tall
    in frame, distance = (170 × 700) / 340 = 350cm = 3.5m.
    """
    real_h = KNOWN_HEIGHTS_CM.get(cls_name)
    if not real_h or bbox_height_px < 5:
        return None

    dist_cm = (real_h * FOCAL_LENGTH_PX) / bbox_height_px
    dist_m = dist_cm / 100

    if dist_m < 0.8:
        return "within arm's reach"
    elif dist_m < 1.5:
        return f"{dist_m:.1f}m — very close"
    elif dist_m < 4.0:
        return f"{dist_m:.1f}m ahead"
    else:
        return f"{dist_m:.0f}m away"


def draw_trapezoid(frame, x1, y1, x2, y2, colour):
    """
    Draw a perspective trapezoid from the object down to the bottom centre of
    the frame — the path to the object.

    Wide at the bottom (user position) → narrow at the top (object position).
    The further away the object, the narrower the top naturally becomes,
    because YOLO returns a smaller box for distant objects. This gives an
    instant visual sense of depth and direction.
    """
    h, w = frame.shape[:2]

    # Bottom centre of the frame = where the user is standing
    bottom_left = (w // 2 - 40, h - 10)
    bottom_right = (w // 2 + 40, h - 10)

    # Top of the trapezoid = bottom edge of the object's bounding box
    top_left = (x1, y2)
    top_right = (x2, y2)

    # Draw the two converging side lines — the "path lines" from user to object
    cv2.line(frame, bottom_left, top_left, colour, 1, cv2.LINE_AA)
    cv2.line(frame, bottom_right, top_right, colour, 1, cv2.LINE_AA)

    # Draw a semi-transparent filled trapezoid to show the path area
    pts = np.array([bottom_left, bottom_right, top_right, top_left], dtype=np.int32)
    overlay = frame.copy()
    cv2.fillPoly(overlay, [pts], colour)
    cv2.addWeighted(overlay, 0.08, frame, 0.92, 0, frame)

    return frame


@spaces.GPU
def analyse(image):
    if image is None:
        return None, "No image provided."
    # Move the model to the GPU for this request
    model.to("cuda")

    # Downscale first so inference stays fast
    max_width = 640
    w, h = image.size
    if w > max_width:
        ratio = max_width / w
        image = image.resize((max_width, int(h * ratio)), Image.LANCZOS)

    # Convert PIL (RGB) → OpenCV (BGR).
    # PIL uses Red-Green-Blue, OpenCV uses Blue-Green-Red;
    # cv2.cvtColor handles the channel swap.
    frame = np.array(image)
    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

    # Run YOLO — returns a list of results
    results = model(frame, verbose=False, conf=0.45)

    annotated = frame.copy()
    detections = []
    spoken_parts = []

    if results[0].boxes is not None:
        for box in results[0].boxes:
            # Extract box coordinates
            x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
            cls_name = model.names[int(box.cls[0])]
            confidence = float(box.conf[0])
            colour = COLOURS.get(cls_name, DEFAULT_COLOUR)

            # ── Distance estimation ────────────────────────────
            bbox_h = y2 - y1
            distance = estimate_distance(cls_name, bbox_h)

            # ── Draw perspective trapezoid first (behind the box) ──
            annotated = draw_trapezoid(annotated, x1, y1, x2, y2, colour)

            # ── Semi-transparent box fill ──────────────────────
            overlay = annotated.copy()
            cv2.rectangle(overlay, (x1, y1), (x2, y2), colour, -1)
            cv2.addWeighted(overlay, 0.12, annotated, 0.88, 0, annotated)

            # ── Box border ─────────────────────────────────────
            cv2.rectangle(annotated, (x1, y1), (x2, y2), colour, 2)

            # ── CCTV corner brackets ───────────────────────────
            L, T = 16, 3
            cv2.line(annotated, (x1, y1), (x1 + L, y1), colour, T)
            cv2.line(annotated, (x1, y1), (x1, y1 + L), colour, T)
            cv2.line(annotated, (x2, y1), (x2 - L, y1), colour, T)
            cv2.line(annotated, (x2, y1), (x2, y1 + L), colour, T)
            cv2.line(annotated, (x1, y2), (x1 + L, y2), colour, T)
            cv2.line(annotated, (x1, y2), (x1, y2 - L), colour, T)
            cv2.line(annotated, (x2, y2), (x2 - L, y2), colour, T)
            cv2.line(annotated, (x2, y2), (x2, y2 - L), colour, T)

            # ── Top label: CLASS | CONFIDENCE ──────────────────
            label = f"{cls_name.upper()} {confidence * 100:.0f}%"
            font = cv2.FONT_HERSHEY_SIMPLEX
            (tw, th), _ = cv2.getTextSize(label, font, 0.55, 1)
            cv2.rectangle(annotated, (x1, y1 - th - 10), (x1 + tw + 8, y1), colour, -1)
            cv2.putText(annotated, label, (x1 + 4, y1 - 6), font, 0.55, (0, 0, 0), 1, cv2.LINE_AA)

            # ── Distance label below the box ───────────────────
            if distance:
                cv2.putText(annotated, distance, (x1 + 4, y2 + 18), font, 0.45, colour, 1, cv2.LINE_AA)

            # ── Build detection record ─────────────────────────
            det_text = f"→ {cls_name} ({confidence * 100:.0f}%)"
            if distance:
                det_text += f" | {distance}"
            detections.append(det_text)

            # ── Build spoken description part ──────────────────
            spoken = cls_name
            if distance:
                spoken += f", {distance}"
            spoken_parts.append(spoken)

    # ── Convert back to RGB for Gradio ─────────────────────────
    rgb = cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB)

    # ── Build final output text ─────────────────────────────────
    if spoken_parts:
        spoken_sentence = "Navigator sees: " + ". ".join(spoken_parts)
        summary = "\n".join(detections)
    else:
        spoken_sentence = "Nothing clearly detected in this scene."
        summary = "No objects detected."
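    # The status table in the UI below lists text-to-speech as "Integrating".
    # A minimal sketch of that hook could slot in here, assuming gTTS were
    # added to requirements.txt (the library choice and filename are
    # assumptions, not part of the current demo):
    #
    #     from gtts import gTTS
    #     gTTS(spoken_sentence).save("navigator_speech.mp3")
    #     # the MP3 could then be returned through an extra gr.Audio output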
output_text = f"Navigator would say:\n\"{spoken_sentence}\"\n\n[] Detections:\n{summary}" return Image.fromarray(rgb), output_text # ───────────────────────────────────────────────────────────── # GRADIO UI # ───────────────────────────────────────────────────────────── with gr.Blocks(title="Navigator — AI Sight Assistant") as demo: gr.Markdown(""" # Navigator — AI Sight Assistant **Real-time scene description for visually impaired users** > *"Empowering blind users to hear their surroundings through AI"* Built for the **AMD Developer Hackathon** · Track 3: Vision & Multimodal AI Powered by **YOLOv8 · LLaVA · FER · LiteRun optimizer · AMD MI300X + ROCm** """) gr.Markdown("---") # ── How it works status table ────────────────────────────── gr.Markdown(""" ### Live Demo Status | Module | Status | What it does | |--------|--------|--------------| | YOLOv8 object detection | Live | Finds objects in 50ms | | Distance estimation | Live | Calculates metres to each object | | Perspective path lines | Live | Draws converging lines to show depth | | FER emotion detection | Integrating | Reads facial expressions | | LLaVA scene description | Integrating | Deep natural language descriptions | | Text-to-speech output | Integrating | Speaks result into earpiece | | LiteRun memory optimizer | AMD cloud | Fits 16GB model into 6GB VRAM | """) gr.Markdown("---") gr.Markdown("### Try it — use your webcam or upload an image") with gr.Tabs(): # with gr.Tab("Live Webcam"): # with gr.Row(): # with gr.Column(scale=1): # webcam_inp = gr.Image( # sources=["webcam"], # type="pil", # label="Webcam", # streaming=True # ) # with gr.Column(scale=1): # webcam_out_img = gr.Image(label="Annotated Feed") # webcam_out_text = gr.Textbox( # label="Navigator Output", # lines=6 # ) # # Auto-analyse every 0.05 seconds from webcam # webcam_inp.stream( # fn=analyse, # inputs=webcam_inp, # outputs=[webcam_out_img, webcam_out_text], # time_limit=60, # stream_every=1.5 # ) with gr.Tab("Upload Image"): with gr.Row(): with gr.Column(scale=1): upload_inp = gr.Image( sources=["upload"], type="pil", label="Upload Image" ) btn = gr.Button("Analyse", variant="primary", size="lg") with gr.Column(scale=1): upload_out_img = gr.Image(label="Annotated Feed") upload_out_text = gr.Textbox( label="Navigator Output", lines=6 ) btn.click(fn=analyse, inputs=upload_inp, outputs=[upload_out_img, upload_out_text]) gr.Markdown("---") gr.Markdown(""" ### How Navigator works Your Camera ↓ YOLO detects every object (50ms — fast path) ↓ Distance calculated per object using pinhole camera model ↓ Perspective trapezoid drawn showing path to each object ↓ FER reads emotion of each detected person (coming soon) ↓ LLaVA describes full scene in natural language (coming soon) ↓ TTS speaks result into blind user's earpiece (coming soon) ### ⚡ LiteRun — Our Memory Optimizer LLaVA normally needs **16GB VRAM** to run. LiteRun squeezes it into **6GB** using quantization + memory routing + KV cache compression. **Before OptiLite:** crashes out of memory **After OptiLite:** runs smoothly on AMD hardware *Full real-time webcam demo with audio coming Day 7* 🔥 """) demo.launch()