import spaces
import gradio as gr
import cv2
import numpy as np
from ultralytics import YOLO
from PIL import Image

# Load YOLO once at startup.
# yolov8n.pt = nano model, fastest, best for CPU.
# Downloads automatically on first run (~6MB).
print("Loading YOLO...")
model = YOLO("yolov8n.pt")
model.to("cpu")  # keep on CPU at load time
print("YOLO ready")

# Colour map — BGR format (OpenCV uses Blue-Green-Red)
COLOURS = {
    "person":     (0, 200, 255),
    "chair":      (255, 100, 0),
    "bottle":     (0, 255, 100),
    "laptop":     (255, 200, 0),
    "car":        (0, 100, 255),
    "dog":        (180, 0, 255),
    "cat":        (255, 0, 180),
    "cup":        (0, 255, 220),
    "book":       (100, 255, 0),
    "cell phone": (255, 255, 0),  # COCO names this class "cell phone", not "phone"
    "backpack":   (255, 140, 0),
    "umbrella":   (200, 0, 255),
}
DEFAULT_COLOUR = (200, 200, 200)

# Known real-world heights in cm for distance estimation
KNOWN_HEIGHTS_CM = {
    "person": 170,
    "door": 200,
    "chair": 90,
    "car": 150,
    "bottle": 25,
    "cup": 10,
    "laptop": 25,
    "stop sign": 75,
}

FOCAL_LENGTH_PX = 700  # approximate focal length for a standard webcam


def estimate_distance(cls_name, bbox_height_px):
    """
    Estimate distance using the pinhole camera formula:

        distance = (real_height_cm × focal_length) / pixel_height

    Example: a person is ~170cm tall in real life. If they appear 340px tall
    in frame, distance = (170 × 700) / 340 = 350cm = 3.5m.
    """
    real_h = KNOWN_HEIGHTS_CM.get(cls_name)
    if not real_h or bbox_height_px < 5:
        return None

    dist_cm = (real_h * FOCAL_LENGTH_PX) / bbox_height_px
    dist_m = dist_cm / 100

    if dist_m < 0.8:
        return "within arm's reach"
    elif dist_m < 1.5:
        return f"{dist_m:.1f}m — very close"
    elif dist_m < 4.0:
        return f"{dist_m:.1f}m ahead"
    else:
        return f"{dist_m:.0f}m away"


def draw_trapezoid(frame, x1, y1, x2, y2, colour):
    """
    Draw a perspective trapezoid from the object down to the bottom centre of
    the frame — the path to the object.

    Wide at the bottom (user position) → narrow at the top (object position).
    The further away the object, the narrower the top naturally becomes,
    because YOLO returns a smaller box for distant objects. This gives an
    instant visual sense of depth and direction.
    """
    h, w = frame.shape[:2]

    # Bottom centre of the frame = where the user is standing
    bottom_left = (w // 2 - 40, h - 10)
    bottom_right = (w // 2 + 40, h - 10)

    # Top of the trapezoid = bottom edge of the object's bounding box
    top_left = (x1, y2)
    top_right = (x2, y2)

    # Draw the two converging side lines — the "path lines" from user to object
    cv2.line(frame, bottom_left, top_left, colour, 1, cv2.LINE_AA)
    cv2.line(frame, bottom_right, top_right, colour, 1, cv2.LINE_AA)

    # Draw a semi-transparent filled trapezoid to show the path area
    pts = np.array([bottom_left, bottom_right, top_right, top_left], dtype=np.int32)
    overlay = frame.copy()
    cv2.fillPoly(overlay, [pts], colour)
    cv2.addWeighted(overlay, 0.08, frame, 0.92, 0, frame)

    return frame


@spaces.GPU
def analyse(image):
    if image is None:
        return None, "No image provided."
    # Move the model to the GPU for this request
    model.to("cuda")

    # Downscale first so inference stays fast
    max_width = 640
    w, h = image.size
    if w > max_width:
        ratio = max_width / w
        image = image.resize((max_width, int(h * ratio)), Image.LANCZOS)

    # Convert PIL (RGB) → OpenCV (BGR).
    # PIL uses Red-Green-Blue, OpenCV uses Blue-Green-Red;
    # cv2.cvtColor handles the channel swap.
    frame = np.array(image)
    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

    # Run YOLO — returns a list of results
    results = model(frame, verbose=False, conf=0.45)

    annotated = frame.copy()
    detections = []
    spoken_parts = []

    if results[0].boxes is not None:
        for box in results[0].boxes:
            # Extract box coordinates
            x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
            cls_name = model.names[int(box.cls[0])]
            confidence = float(box.conf[0])
            colour = COLOURS.get(cls_name, DEFAULT_COLOUR)

            # ── Distance estimation ────────────────────────────
            bbox_h = y2 - y1
            distance = estimate_distance(cls_name, bbox_h)

            # ── Draw perspective trapezoid first (behind the box) ──
            annotated = draw_trapezoid(annotated, x1, y1, x2, y2, colour)

            # ── Semi-transparent box fill ──────────────────────
            overlay = annotated.copy()
            cv2.rectangle(overlay, (x1, y1), (x2, y2), colour, -1)
            cv2.addWeighted(overlay, 0.12, annotated, 0.88, 0, annotated)

            # ── Box border ─────────────────────────────────────
            cv2.rectangle(annotated, (x1, y1), (x2, y2), colour, 2)

            # ── CCTV corner brackets ───────────────────────────
            L, T = 16, 3
            cv2.line(annotated, (x1, y1), (x1 + L, y1), colour, T)
            cv2.line(annotated, (x1, y1), (x1, y1 + L), colour, T)
            cv2.line(annotated, (x2, y1), (x2 - L, y1), colour, T)
            cv2.line(annotated, (x2, y1), (x2, y1 + L), colour, T)
            cv2.line(annotated, (x1, y2), (x1 + L, y2), colour, T)
            cv2.line(annotated, (x1, y2), (x1, y2 - L), colour, T)
            cv2.line(annotated, (x2, y2), (x2 - L, y2), colour, T)
            cv2.line(annotated, (x2, y2), (x2, y2 - L), colour, T)

            # ── Top label: CLASS | CONFIDENCE ──────────────────
            label = f"{cls_name.upper()} {confidence * 100:.0f}%"
            font = cv2.FONT_HERSHEY_SIMPLEX
            (tw, th), _ = cv2.getTextSize(label, font, 0.55, 1)
            cv2.rectangle(annotated, (x1, y1 - th - 10), (x1 + tw + 8, y1), colour, -1)
            cv2.putText(annotated, label, (x1 + 4, y1 - 6), font, 0.55, (0, 0, 0), 1, cv2.LINE_AA)

            # ── Distance label below the box ───────────────────
            if distance:
                cv2.putText(annotated, distance, (x1 + 4, y2 + 18), font, 0.45, colour, 1, cv2.LINE_AA)

            # ── Build detection record ─────────────────────────
            det_text = f"→ {cls_name} ({confidence * 100:.0f}%)"
            if distance:
                det_text += f" | {distance}"
            detections.append(det_text)

            # ── Build spoken description part ──────────────────
            spoken = cls_name
            if distance:
                spoken += f", {distance}"
            spoken_parts.append(spoken)

    # ── Convert back to RGB for Gradio ─────────────────────────
    rgb = cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB)

    # ── Build final output text ─────────────────────────────────
    if spoken_parts:
        spoken_sentence = "Navigator sees: " + ". ".join(spoken_parts)
        summary = "\n".join(detections)
    else:
        spoken_sentence = "Nothing clearly detected in this scene."
        summary = "No objects detected."
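    # The status table in the UI below lists text-to-speech as "Integrating".
    # A minimal sketch of that hook could slot in here, assuming gTTS were
    # added to requirements.txt (the library choice and filename are
    # assumptions, not part of the current demo):
    #
    #     from gtts import gTTS
    #     gTTS(spoken_sentence).save("navigator_speech.mp3")
    #     # the MP3 could then be returned through an extra gr.Audio output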
output_text = f"Navigator would say:\n\"{spoken_sentence}\"\n\n[] Detections:\n{summary}" return Image.fromarray(rgb), output_text # ───────────────────────────────────────────────────────────── # GRADIO UI # ───────────────────────────────────────────────────────────── with gr.Blocks(title="Navigator — AI Sight Assistant") as demo: gr.Markdown(""" # Navigator — AI Sight Assistant **Real-time scene description for visually impaired users** > *"Empowering blind users to hear their surroundings through AI"* Built for the **AMD Developer Hackathon** · Track 3: Vision & Multimodal AI Powered by **YOLOv8 · LLaVA · FER · LiteRun optimizer · AMD MI300X + ROCm** """) gr.Markdown("---") # ── How it works status table ────────────────────────────── gr.Markdown(""" ### Live Demo Status | Module | Status | What it does | |--------|--------|--------------| | YOLOv8 object detection | Live | Finds objects in 50ms | | Distance estimation | Live | Calculates metres to each object | | Perspective path lines | Live | Draws converging lines to show depth | | FER emotion detection | Integrating | Reads facial expressions | | LLaVA scene description | Integrating | Deep natural language descriptions | | Text-to-speech output | Integrating | Speaks result into earpiece | | LiteRun memory optimizer | AMD cloud | Fits 16GB model into 6GB VRAM | """) gr.Markdown("---") gr.Markdown("### Try it — use your webcam or upload an image") with gr.Tabs(): # with gr.Tab("Live Webcam"): # with gr.Row(): # with gr.Column(scale=1): # webcam_inp = gr.Image( # sources=["webcam"], # type="pil", # label="Webcam", # streaming=True # ) # with gr.Column(scale=1): # webcam_out_img = gr.Image(label="Annotated Feed") # webcam_out_text = gr.Textbox( # label="Navigator Output", # lines=6 # ) # # Auto-analyse every 0.05 seconds from webcam # webcam_inp.stream( # fn=analyse, # inputs=webcam_inp, # outputs=[webcam_out_img, webcam_out_text], # time_limit=60, # stream_every=1.5 # ) with gr.Tab("Upload Image"): with gr.Row(): with gr.Column(scale=1): upload_inp = gr.Image( sources=["upload"], type="pil", label="Upload Image" ) btn = gr.Button("Analyse", variant="primary", size="lg") with gr.Column(scale=1): upload_out_img = gr.Image(label="Annotated Feed") upload_out_text = gr.Textbox( label="Navigator Output", lines=6 ) btn.click(fn=analyse, inputs=upload_inp, outputs=[upload_out_img, upload_out_text]) gr.Markdown("---") gr.Markdown(""" ### How Navigator works Your Camera ↓ YOLO detects every object (50ms — fast path) ↓ Distance calculated per object using pinhole camera model ↓ Perspective trapezoid drawn showing path to each object ↓ FER reads emotion of each detected person (coming soon) ↓ LLaVA describes full scene in natural language (coming soon) ↓ TTS speaks result into blind user's earpiece (coming soon) ### ⚡ LiteRun — Our Memory Optimizer LLaVA normally needs **16GB VRAM** to run. LiteRun squeezes it into **6GB** using quantization + memory routing + KV cache compression. **Before OptiLite:** crashes out of memory **After OptiLite:** runs smoothly on AMD hardware *Full real-time webcam demo with audio coming Day 7* 🔥 """) demo.launch()