import spaces
import gradio as gr
import cv2
import numpy as np
from ultralytics import YOLO
from PIL import Image

# Load YOLO once at startup.
# yolov8n.pt = nano model, fastest, best for CPU.
# Downloads automatically on first run (~6MB).
print("Loading YOLO...")
model = YOLO("yolov8n.pt")
model.to("cpu")  # keep on CPU at load time
print("YOLO ready")
# Colour map - BGR format (OpenCV uses Blue-Green-Red)
COLOURS = {
    "person":     (0, 200, 255),
    "chair":      (255, 100, 0),
    "bottle":     (0, 255, 100),
    "laptop":     (255, 200, 0),
    "car":        (0, 100, 255),
    "dog":        (180, 0, 255),
    "cat":        (255, 0, 180),
    "cup":        (0, 255, 220),
    "book":       (100, 255, 0),
    "cell phone": (255, 255, 0),  # COCO class name is "cell phone", not "phone"
    "backpack":   (255, 140, 0),
    "umbrella":   (200, 0, 255),
}
DEFAULT_COLOUR = (200, 200, 200)

# Known real-world heights in cm for distance estimation
KNOWN_HEIGHTS_CM = {
    "person": 170, "door": 200, "chair": 90,
    "car": 150, "bottle": 25, "cup": 10,
    "laptop": 25, "stop sign": 75,
}
FOCAL_LENGTH_PX = 700  # approximate for a standard webcam
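# Rough sanity check for that value (an assumption, not a measurement): for a
# pinhole camera, focal_px ≈ (frame_width / 2) / tan(horizontal_FOV / 2). With
# the 640px frames used below and a typical ~50° webcam FOV, that gives
# (640 / 2) / tan(25°) ≈ 686 px, so 700 is a reasonable ballpark.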


def estimate_distance(cls_name, bbox_height_px):
    """
    Estimates distance using the pinhole camera formula:
        distance = (real_height_cm * focal_length) / pixel_height
    Example: a person is ~170cm tall in real life.
    If they appear 340px tall in frame -> distance = (170 * 700) / 340 = 350cm = 3.5m
    """
    real_h = KNOWN_HEIGHTS_CM.get(cls_name)
    if not real_h or bbox_height_px < 5:
        return None
    dist_cm = (real_h * FOCAL_LENGTH_PX) / bbox_height_px
    dist_m = dist_cm / 100
    if dist_m < 0.8:
        return "within arm's reach"
    elif dist_m < 1.5:
        return f"{dist_m:.1f}m - very close"
    elif dist_m < 4.0:
        return f"{dist_m:.1f}m ahead"
    else:
        return f"{dist_m:.0f}m away"


def draw_trapezoid(frame, x1, y1, x2, y2, colour):
    """
    Draws a perspective trapezoid from the object down to the
    bottom centre of the frame, showing the path to the object.
    Wide at the bottom (user position) -> narrow at the top (object position).
    The further the object, the narrower the top naturally is,
    because YOLO gives a smaller box for distant objects.
    This gives an instant visual sense of depth and direction.
    """
    h, w = frame.shape[:2]
    # Bottom centre of frame = where the user is standing
    bottom_left = (w // 2 - 40, h - 10)
    bottom_right = (w // 2 + 40, h - 10)
    # Top of trapezoid = bottom of the object's bounding box
    top_left = (x1, y2)
    top_right = (x2, y2)
    # Draw the two converging side lines.
    # These are the "path lines" from user to object.
    cv2.line(frame, bottom_left, top_left, colour, 1, cv2.LINE_AA)
    cv2.line(frame, bottom_right, top_right, colour, 1, cv2.LINE_AA)
    # Draw a semi-transparent filled trapezoid to show the path area
    pts = np.array([bottom_left, bottom_right,
                    top_right, top_left], dtype=np.int32)
    overlay = frame.copy()
    cv2.fillPoly(overlay, [pts], colour)
    cv2.addWeighted(overlay, 0.08, frame, 0.92, 0, frame)
    return frame
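
# With the 640px-wide frames used below, the trapezoid base spans x = 280..360
# at y = h - 10: a fixed 80px "feet" anchor at the user's position, tapering up
# to the bottom edge of each detected box.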


@spaces.GPU  # request a ZeroGPU slot for the duration of this call
def analyse(image):
    if image is None:
        return None, "No image provided."

    # Move the model to the GPU for this request
    model.to("cuda")

    # Downscale large images first so inference stays fast
    max_width = 640
    w, h = image.size
    if w > max_width:
        ratio = max_width / w
        image = image.resize((max_width, int(h * ratio)), Image.LANCZOS)

    # Convert PIL (RGB) -> OpenCV (BGR).
    # PIL uses Red-Green-Blue, OpenCV uses Blue-Green-Red,
    # so we swap the channel order with cv2.cvtColor.
    frame = np.array(image)
    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

    # Run YOLO - returns a list of results
    results = model(frame, verbose=False, conf=0.45)

    annotated = frame.copy()
    detections = []
    spoken_parts = []

    if results[0].boxes is not None:
        for box in results[0].boxes:
            # Extract box coordinates
            x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
            cls_name = model.names[int(box.cls[0])]
            confidence = float(box.conf[0])
            colour = COLOURS.get(cls_name, DEFAULT_COLOUR)

            # ── Distance estimation ─────────────────────────────
            bbox_h = y2 - y1
            distance = estimate_distance(cls_name, bbox_h)

            # ── Draw perspective trapezoid first (behind box) ───
            annotated = draw_trapezoid(annotated, x1, y1, x2, y2, colour)

            # ── Semi-transparent box fill ───────────────────────
            overlay = annotated.copy()
            cv2.rectangle(overlay, (x1, y1), (x2, y2), colour, -1)
            cv2.addWeighted(overlay, 0.12, annotated, 0.88, 0, annotated)

            # ── Box border ──────────────────────────────────────
            cv2.rectangle(annotated, (x1, y1), (x2, y2), colour, 2)

            # ── CCTV corner brackets ────────────────────────────
            L, T = 16, 3
            cv2.line(annotated, (x1, y1), (x1 + L, y1), colour, T)
            cv2.line(annotated, (x1, y1), (x1, y1 + L), colour, T)
            cv2.line(annotated, (x2, y1), (x2 - L, y1), colour, T)
            cv2.line(annotated, (x2, y1), (x2, y1 + L), colour, T)
            cv2.line(annotated, (x1, y2), (x1 + L, y2), colour, T)
            cv2.line(annotated, (x1, y2), (x1, y2 - L), colour, T)
            cv2.line(annotated, (x2, y2), (x2 - L, y2), colour, T)
            cv2.line(annotated, (x2, y2), (x2, y2 - L), colour, T)

            # ── Top label: CLASS | CONFIDENCE ───────────────────
            label = f"{cls_name.upper()} {confidence*100:.0f}%"
            font = cv2.FONT_HERSHEY_SIMPLEX
            (tw, th), _ = cv2.getTextSize(label, font, 0.55, 1)
            cv2.rectangle(annotated, (x1, y1 - th - 10), (x1 + tw + 8, y1), colour, -1)
            cv2.putText(annotated, label, (x1 + 4, y1 - 6),
                        font, 0.55, (0, 0, 0), 1, cv2.LINE_AA)

            # ── Distance label below box ────────────────────────
            if distance:
                cv2.putText(annotated, distance,
                            (x1 + 4, y2 + 18),
                            font, 0.45, colour, 1, cv2.LINE_AA)

            # ── Build detection record ──────────────────────────
            det_text = f"• {cls_name} ({confidence*100:.0f}%)"
            if distance:
                det_text += f" | {distance}"
            detections.append(det_text)

            # ── Build spoken description part ───────────────────
            spoken = cls_name
            if distance:
                spoken += f", {distance}"
            spoken_parts.append(spoken)

    # ── Convert back to RGB for Gradio ──────────────────────────
    rgb = cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB)

    # ── Build final output text ─────────────────────────────────
    if spoken_parts:
        spoken_sentence = "Navigator sees: " + ". ".join(spoken_parts)
        summary = "\n".join(detections)
    else:
        spoken_sentence = "Nothing clearly detected in this scene."
        summary = "No objects detected."

    output_text = f"Navigator would say:\n\"{spoken_sentence}\"\n\nDetections:\n{summary}"
    return Image.fromarray(rgb), output_text
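

# The UI below lists "Text-to-speech output" as still integrating. A minimal
# sketch of that step could look like this helper - an assumption for
# illustration only (gTTS is not currently a dependency of this Space), and it
# is not wired into the Gradio UI yet.
def speak(text, out_path="navigator_tts.mp3"):
    """Render Navigator's spoken sentence to an MP3 for earpiece playback."""
    from gtts import gTTS  # deferred import: optional, illustrative dependency
    gTTS(text=text, lang="en").save(out_path)
    return out_path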


# ─────────────────────────────────────────────────────────────
# GRADIO UI
# ─────────────────────────────────────────────────────────────
with gr.Blocks(title="Navigator - AI Sight Assistant") as demo:
    gr.Markdown("""
# Navigator - AI Sight Assistant
**Real-time scene description for visually impaired users**

> *"Empowering blind users to hear their surroundings through AI"*

Built for the **AMD Developer Hackathon** · Track 3: Vision & Multimodal AI
Powered by **YOLOv8 · LLaVA · FER · LiteRun optimizer · AMD MI300X + ROCm**
""")
| gr.Markdown("---") | |
| # ββ How it works status table ββββββββββββββββββββββββββββββ | |
| gr.Markdown(""" | |
| ### Live Demo Status | |
| | Module | Status | What it does | | |
| |--------|--------|--------------| | |
| | YOLOv8 object detection | Live | Finds objects in 50ms | | |
| | Distance estimation | Live | Calculates metres to each object | | |
| | Perspective path lines | Live | Draws converging lines to show depth | | |
| | FER emotion detection | Integrating | Reads facial expressions | | |
| | LLaVA scene description | Integrating | Deep natural language descriptions | | |
| | Text-to-speech output | Integrating | Speaks result into earpiece | | |
| | LiteRun memory optimizer | AMD cloud | Fits 16GB model into 6GB VRAM | | |
| """) | |
| gr.Markdown("---") | |
| gr.Markdown("### Try it β use your webcam or upload an image") | |

    with gr.Tabs():
        # with gr.Tab("Live Webcam"):
        #     with gr.Row():
        #         with gr.Column(scale=1):
        #             webcam_inp = gr.Image(
        #                 sources=["webcam"],
        #                 type="pil",
        #                 label="Webcam",
        #                 streaming=True
        #             )
        #         with gr.Column(scale=1):
        #             webcam_out_img = gr.Image(label="Annotated Feed")
        #             webcam_out_text = gr.Textbox(
        #                 label="Navigator Output",
        #                 lines=6
        #             )
        #     # Auto-analyse a webcam frame every 1.5 seconds
        #     webcam_inp.stream(
        #         fn=analyse,
        #         inputs=webcam_inp,
        #         outputs=[webcam_out_img, webcam_out_text],
        #         time_limit=60,
        #         stream_every=1.5
        #     )
| with gr.Tab("Upload Image"): | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| upload_inp = gr.Image( | |
| sources=["upload"], | |
| type="pil", | |
| label="Upload Image" | |
| ) | |
| btn = gr.Button("Analyse", variant="primary", size="lg") | |
| with gr.Column(scale=1): | |
| upload_out_img = gr.Image(label="Annotated Feed") | |
| upload_out_text = gr.Textbox( | |
| label="Navigator Output", | |
| lines=6 | |
| ) | |
| btn.click(fn=analyse, inputs=upload_inp, outputs=[upload_out_img, upload_out_text]) | |
| gr.Markdown("---") | |
| gr.Markdown(""" | |
| ### How Navigator works | |
| Your Camera | |
| β | |
| YOLO detects every object (50ms β fast path) | |
| β | |
| Distance calculated per object using pinhole camera model | |
| β | |
| Perspective trapezoid drawn showing path to each object | |
| β | |
| FER reads emotion of each detected person (coming soon) | |
| β | |
| LLaVA describes full scene in natural language (coming soon) | |
| β | |
| TTS speaks result into blind user's earpiece (coming soon) | |

### ⚡ LiteRun - Our Memory Optimizer

LLaVA normally needs **16GB VRAM** to run.
LiteRun squeezes it into **6GB** using quantization + memory routing + KV cache compression.

**Before LiteRun:** crashes out of memory
**After LiteRun:** runs smoothly on AMD hardware
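
Under the hood, the quantization step alone looks roughly like this (an illustrative
sketch, not LiteRun's actual pipeline; it assumes `transformers` + `bitsandbytes` are
available on the target hardware):

```python
# Illustrative 4-bit loading sketch - not LiteRun itself
import torch
from transformers import BitsAndBytesConfig, LlavaForConditionalGeneration

bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
llava = LlavaForConditionalGeneration.from_pretrained(
    "llava-hf/llava-1.5-7b-hf",   # assumed checkpoint
    quantization_config=bnb,
    device_map="auto",
)
```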

*Full real-time webcam demo with audio coming Day 7* 🔥
""")

demo.launch()