Remove video stream: not good enough because the GPU is allocated per function call
74f14ed
import spaces
import gradio as gr
import cv2
import numpy as np
from ultralytics import YOLO
from PIL import Image
# Load YOLO once at startup
# yolov8n.pt = nano model, fastest, best for CPU
# Downloads automatically on first run (~6MB)
print("Loading YOLO...")
model = YOLO("yolov8n.pt")
model.to("cpu") # keep on CPU at load time
print("YOLO ready")
# Colour map in BGR format (OpenCV uses Blue-Green-Red)
COLOURS = {
"person": (0, 200, 255),
"chair": (255, 100, 0),
"bottle": (0, 255, 100),
"laptop": (255, 200, 0),
"car": (0, 100, 255),
"dog": (180, 0, 255),
"cat": (255, 0, 180),
"cup": (0, 255, 220),
"book": (100, 255, 0),
"phone": (255, 255, 0),
"backpack":(255, 140, 0),
"umbrella":(200, 0, 255),
}
DEFAULT_COLOUR = (200, 200, 200)
# Known real-world heights in cm for distance estimation
KNOWN_HEIGHTS_CM = {
"person": 170, "door": 200, "chair": 90,
"car": 150, "bottle": 25, "cup": 10,
"laptop": 25, "stop sign": 75,
}
FOCAL_LENGTH_PX = 700 # approximate for standard webcam
def estimate_distance(cls_name, bbox_height_px):
"""
Estimates distance using pinhole camera formula.
distance = (real_height_cm Γ— focal_length) / pixel_height
Example: person is 170cm tall in real life.
If they appear 340px tall in frame β†’ distance = (170Γ—700)/340 = 350cm = 3.5m
"""
real_h = KNOWN_HEIGHTS_CM.get(cls_name)
if not real_h or bbox_height_px < 5:
return None
dist_cm = (real_h * FOCAL_LENGTH_PX) / bbox_height_px
dist_m = dist_cm / 100
if dist_m < 0.8:
return "within arm's reach"
elif dist_m < 1.5:
        return f"{dist_m:.1f}m - very close"
elif dist_m < 4.0:
return f"{dist_m:.1f}m ahead"
else:
return f"{dist_m:.0f}m away"
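
# Optional calibration sketch (hypothetical helper, not called anywhere in
# this demo): FOCAL_LENGTH_PX above is only an approximation, so accuracy
# improves if you solve the pinhole formula for focal length using a single
# measured reference shot, e.g. a person of known height at a known distance.
def calibrate_focal_length_px(bbox_height_px, distance_cm, real_height_cm):
    """Rearranged pinhole formula: focal = (pixel_height * distance) / real_height."""
    return (bbox_height_px * distance_cm) / real_height_cm
# Example: a 170cm person standing 200cm away and appearing 595px tall
# gives (595 * 200) / 170 = 700px, matching FOCAL_LENGTH_PX above.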
def draw_trapezoid(frame, x1, y1, x2, y2, colour):
"""
Draws a perspective trapezoid from the object down to the
bottom centre of the frame β€” showing the path to the object.
Wide at bottom (user position) β†’ narrow at top (object position).
The further the object, the narrower the top naturally is
because YOLO gives a smaller box for distant objects.
This gives an instant visual sense of depth and direction.
"""
h, w = frame.shape[:2]
# Bottom centre of frame = where the user is standing
bottom_left = (w // 2 - 40, h - 10)
bottom_right = (w // 2 + 40, h - 10)
# Top of trapezoid = bottom of the object's bounding box
top_left = (x1, y2)
top_right = (x2, y2)
# Draw the two converging side lines
# These are the "path lines" from user to object
cv2.line(frame, bottom_left, top_left, colour, 1, cv2.LINE_AA)
cv2.line(frame, bottom_right, top_right, colour, 1, cv2.LINE_AA)
# Draw a semi-transparent filled trapezoid to show the path area
pts = np.array([bottom_left, bottom_right,
top_right, top_left], dtype=np.int32)
overlay = frame.copy()
cv2.fillPoly(overlay, [pts], colour)
cv2.addWeighted(overlay, 0.08, frame, 0.92, 0, frame)
return frame
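
# Note: on Hugging Face ZeroGPU Spaces, @spaces.GPU requests a GPU slice for
# each decorated call and releases it when the call returns. This per-call
# allocation overhead is why the streaming webcam tab further down is
# commented out (see the commit message).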
@spaces.GPU
def analyse(image):
if image is None:
return None, "No image provided."
# Move model to GPU for this request
model.to("cuda")
    # Downscale large images first to keep inference fast
max_width = 640
w, h = image.size
if w > max_width:
ratio = max_width / w
image = image.resize((max_width, int(h * ratio)), Image.LANCZOS)
    # Convert PIL (RGB) -> OpenCV (BGR)
    # PIL stores Red-Green-Blue; OpenCV expects Blue-Green-Red,
    # so cv2.cvtColor swaps the channel order for us.
frame = np.array(image)
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
    # Run YOLO - returns a list of results
results = model(frame, verbose=False, conf=0.45)
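    # conf=0.45 keeps only boxes the model scores at 45%+ confidence;
    # lowering it detects more objects at the cost of false positives.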
annotated = frame.copy()
detections = []
spoken_parts = []
if results[0].boxes is not None:
for box in results[0].boxes:
# Extract box coordinates
x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
cls_name = model.names[int(box.cls[0])]
confidence = float(box.conf[0])
colour = COLOURS.get(cls_name, DEFAULT_COLOUR)
# ── Distance estimation ────────────────────────────
bbox_h = y2 - y1
distance = estimate_distance(cls_name, bbox_h)
# ── Draw perspective trapezoid first (behind box) ──
annotated = draw_trapezoid(annotated, x1, y1, x2, y2, colour)
# ── Semi-transparent box fill ──────────────────────
overlay = annotated.copy()
cv2.rectangle(overlay, (x1,y1), (x2,y2), colour, -1)
cv2.addWeighted(overlay, 0.12, annotated, 0.88, 0, annotated)
# ── Box border ─────────────────────────────────────
cv2.rectangle(annotated, (x1,y1), (x2,y2), colour, 2)
# ── CCTV corner brackets ───────────────────────────
            L, T = 16, 3  # bracket arm length (px) and line thickness
cv2.line(annotated,(x1,y1),(x1+L,y1),colour,T)
cv2.line(annotated,(x1,y1),(x1,y1+L),colour,T)
cv2.line(annotated,(x2,y1),(x2-L,y1),colour,T)
cv2.line(annotated,(x2,y1),(x2,y1+L),colour,T)
cv2.line(annotated,(x1,y2),(x1+L,y2),colour,T)
cv2.line(annotated,(x1,y2),(x1,y2-L),colour,T)
cv2.line(annotated,(x2,y2),(x2-L,y2),colour,T)
cv2.line(annotated,(x2,y2),(x2,y2-L),colour,T)
# ── Top label: CLASS | CONFIDENCE ─────────────────
label = f"{cls_name.upper()} {confidence*100:.0f}%"
font = cv2.FONT_HERSHEY_SIMPLEX
(tw, th), _ = cv2.getTextSize(label, font, 0.55, 1)
cv2.rectangle(annotated,(x1,y1-th-10),(x1+tw+8,y1),colour,-1)
cv2.putText(annotated, label,(x1+4,y1-6),
font, 0.55, (0,0,0), 1, cv2.LINE_AA)
# ── Distance label below box ───────────────────────
if distance:
cv2.putText(annotated, distance,
(x1+4, y2+18),
font, 0.45, colour, 1, cv2.LINE_AA)
# ── Build detection record ─────────────────────────
            det_text = f"-> {cls_name} ({confidence*100:.0f}%)"
if distance:
det_text += f" | {distance}"
detections.append(det_text)
# ── Build spoken description part ──────────────────
spoken = cls_name
if distance:
spoken += f", {distance}"
spoken_parts.append(spoken)
# ── Convert back to RGB for Gradio ────────────────────────
rgb = cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB)
# ── Build final output text ────────────────────────────────
if spoken_parts:
spoken_sentence = "Navigator sees: " + ". ".join(spoken_parts)
summary = "\n".join(detections)
else:
spoken_sentence = "Nothing clearly detected in this scene."
summary = "No objects detected."
    output_text = f"Navigator would say:\n\"{spoken_sentence}\"\n\nDetections:\n{summary}"
return Image.fromarray(rgb), output_text
# ─────────────────────────────────────────────────────────────
# GRADIO UI
# ─────────────────────────────────────────────────────────────
with gr.Blocks(title="Navigator - AI Sight Assistant") as demo:
gr.Markdown("""
# Navigator - AI Sight Assistant
**Real-time scene description for visually impaired users**
> *"Empowering blind users to hear their surroundings through AI"*
Built for the **AMD Developer Hackathon** · Track 3: Vision & Multimodal AI
Powered by **YOLOv8 · LLaVA · FER · LiteRun optimizer · AMD MI300X + ROCm**
""")
gr.Markdown("---")
# ── How it works status table ──────────────────────────────
gr.Markdown("""
### Live Demo Status
| Module | Status | What it does |
|--------|--------|--------------|
| YOLOv8 object detection | Live | Finds objects in ~50ms |
| Distance estimation | Live | Estimates metres to each object |
| Perspective path lines | Live | Draws converging lines to show depth |
| FER emotion detection | Integrating | Reads facial expressions |
| LLaVA scene description | Integrating | Deep natural language descriptions |
| Text-to-speech output | Integrating | Speaks result into earpiece |
| LiteRun memory optimizer | AMD cloud | Fits 16GB model into 6GB VRAM |
""")
gr.Markdown("---")
gr.Markdown("### Try it β€” use your webcam or upload an image")
with gr.Tabs():
# with gr.Tab("Live Webcam"):
# with gr.Row():
# with gr.Column(scale=1):
# webcam_inp = gr.Image(
# sources=["webcam"],
# type="pil",
# label="Webcam",
# streaming=True
# )
# with gr.Column(scale=1):
# webcam_out_img = gr.Image(label="Annotated Feed")
# webcam_out_text = gr.Textbox(
# label="Navigator Output",
# lines=6
# )
        # # Auto-analyse a webcam frame every 1.5 seconds (stream_every=1.5)
# webcam_inp.stream(
# fn=analyse,
# inputs=webcam_inp,
# outputs=[webcam_out_img, webcam_out_text],
# time_limit=60,
# stream_every=1.5
# )
with gr.Tab("Upload Image"):
with gr.Row():
with gr.Column(scale=1):
upload_inp = gr.Image(
sources=["upload"],
type="pil",
label="Upload Image"
)
btn = gr.Button("Analyse", variant="primary", size="lg")
with gr.Column(scale=1):
upload_out_img = gr.Image(label="Annotated Feed")
upload_out_text = gr.Textbox(
label="Navigator Output",
lines=6
)
btn.click(fn=analyse, inputs=upload_inp, outputs=[upload_out_img, upload_out_text])
gr.Markdown("---")
gr.Markdown("""
### How Navigator works
1. Your camera captures a frame
2. YOLO detects every object (~50ms, the fast path)
3. Distance is calculated per object using the pinhole camera model
4. A perspective trapezoid is drawn showing the path to each object
5. FER reads the emotion of each detected person (coming soon)
6. LLaVA describes the full scene in natural language (coming soon)
7. TTS speaks the result into the blind user's earpiece (coming soon)
### ⚡ LiteRun: Our Memory Optimizer
LLaVA normally needs **16GB of VRAM** to run.
LiteRun squeezes it into **6GB** using quantization, memory routing, and KV-cache compression.
**Before LiteRun:** crashes with out-of-memory errors
**After LiteRun:** runs smoothly on AMD hardware
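
A minimal sketch of the 4-bit quantization step LiteRun builds on (the model id and the transformers + bitsandbytes setup here are illustrative assumptions; LiteRun's memory routing and KV-cache compression are not shown):

```python
import torch
from transformers import BitsAndBytesConfig, LlavaForConditionalGeneration

# Store weights in 4-bit NF4 and compute in fp16, cutting LLaVA's
# weight memory to roughly a quarter of the fp16 footprint.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
model = LlavaForConditionalGeneration.from_pretrained(
    "llava-hf/llava-1.5-7b-hf",  # illustrative checkpoint
    quantization_config=bnb_config,
    device_map="auto",
)
```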
*Full real-time webcam demo with audio coming Day 7* 🔥
""")
demo.launch()