Spaces:
Build error
Build error
Upload 6 files
Browse files- Dockerfile +38 -0
- README.md +39 -5
- app.py +201 -0
- best.pt +3 -0
- detections.parquet +3 -0
- requirements.txt +9 -0
Dockerfile
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use official Python runtime as base image
FROM python:3.10-slim

# Set working directory
WORKDIR /app

# Install system dependencies needed by OpenCV / the ML stack.
# FIX: python:3.10-slim is now based on Debian 12 (bookworm), where the
# libgl1-mesa-glx package was removed — installing it fails the build.
# libgl1 is its replacement. --no-install-recommends keeps the layer small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgl1 \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender-dev \
    libgomp1 \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for better caching
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application files
COPY app.py .
COPY best.pt .
COPY detections.parquet .

# Create directory for video (will be downloaded at runtime)
RUN mkdir -p /app/data

# Expose port for Gradio
EXPOSE 7860

# Set environment variables
ENV GRADIO_SERVER_NAME="0.0.0.0"
ENV GRADIO_SERVER_PORT=7860

# Run the application
CMD ["python", "app.py"]
|
README.md
CHANGED
|
@@ -1,11 +1,45 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
colorFrom: blue
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
-
|
| 9 |
---
|
| 10 |
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Car Parts Image-to-Video Retrieval
|
| 3 |
+
emoji: 🚗
|
| 4 |
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
+
license: mit
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# Car Parts Image-to-Video Retrieval System
|
| 12 |
+
|
| 13 |
+
An intelligent system that detects car parts in images and retrieves matching video clips from an indexed automotive video.
|
| 14 |
+
|
| 15 |
+
## Features
|
| 16 |
+
|
| 17 |
+
- **YOLOv26s Detection**: Fine-tuned on car parts dataset
|
| 18 |
+
- **Semantic Matching**: Identifies doors, wheels, headlights, mirrors, bumpers, and more
|
| 19 |
+
- **Temporal Retrieval**: Returns precise video clip timestamps
|
| 20 |
+
- **Interactive Demo**: Upload any car image and find matching video segments
|
| 21 |
+
|
| 22 |
+
## How to Use
|
| 23 |
+
|
| 24 |
+
1. Upload an image containing car parts
|
| 25 |
+
2. The system detects all visible components
|
| 26 |
+
3. View matching video clips with timestamps
|
| 27 |
+
4. Each clip shows where that component appears in the source video
|
| 28 |
+
|
| 29 |
+
## Technical Details
|
| 30 |
+
|
| 31 |
+
- **Model**: YOLOv26s (small variant) fine-tuned for car part detection
|
| 32 |
+
- **Video Index**: Pre-computed detection index with bounding boxes and timestamps
|
| 33 |
+
- **Sampling Strategy**: Every 5th frame (4.8-6 FPS effective rate)
|
| 34 |
+
- **Clip Formation**: 3.0s gap threshold for temporal merging
|
| 35 |
+
|
| 36 |
+
## Assignment Context
|
| 37 |
+
|
| 38 |
+
This demo is part of **Assignment 2** for CS-UY 4613 Artificial Intelligence (Spring 2026).
|
| 39 |
+
|
| 40 |
+
**Student**: Hanze (James) Qiu
|
| 41 |
+
**Repository**: [github.com/JamesQiu2005/CS-UY_4613_Assignments](https://github.com/JamesQiu2005/CS-UY_4613_Assignments)
|
| 42 |
+
|
| 43 |
+
---
|
| 44 |
+
|
| 45 |
+
Built with Ultralytics YOLO, OpenCV, and Gradio.
|
app.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
import pandas as pd
from ultralytics import YOLO
import cv2
import numpy as np
from PIL import Image, ImageDraw
import os

# Load the fine-tuned detector and the pre-computed per-frame detection
# index once at import time so every Gradio request can reuse them.
print("Loading model and detection index...")
model = YOLO("best.pt")
detection_df = pd.read_parquet("detections.parquet")

# Source video used for clip extraction. It is NOT bundled with the image
# (the Dockerfile only creates data/); download_video_if_needed() checks
# for it at runtime.
VIDEO_PATH = "data/videoplayback.mp4"
# NOTE(review): placeholder — the app never actually downloads from this
# URL; replace with a real URL and wire up a download step, or remove.
VIDEO_URL = "YOUR_VIDEO_URL_HERE"  # Replace with actual video URL or YouTube link
|
| 17 |
+
|
| 18 |
+
def download_video_if_needed():
    """Return True when the source video exists at VIDEO_PATH, else False.

    The video is not shipped with the Space; when it is missing we only
    report that fact (a yt-dlp download step could be added here later).
    """
    if os.path.exists(VIDEO_PATH):
        return True
    print(f"Video not found at {VIDEO_PATH}")
    print("Please upload video or provide YouTube URL")
    # You can add yt-dlp here to download from YouTube
    return False
|
| 26 |
+
|
| 27 |
+
def merge_intervals(timestamps, gap_threshold=3.0):
    """Merge nearby timestamps into contiguous (start, end) clips.

    Timestamps are deduplicated and sorted; a new clip is started whenever
    the gap to the previous timestamp exceeds *gap_threshold* seconds.
    Returns a list of (start, end) tuples; an empty input yields [].
    """
    if not timestamps:
        return []

    ordered = sorted(set(timestamps))
    clips = []
    clip_start = ordered[0]
    last_seen = ordered[0]

    for current in ordered[1:]:
        if current - last_seen > gap_threshold:
            # Gap too large: close out the running clip, open a new one.
            clips.append((clip_start, last_seen))
            clip_start = current
        last_seen = current

    # Close the final (always open) clip.
    clips.append((clip_start, last_seen))
    return clips
|
| 45 |
+
|
| 46 |
+
def retrieve_clips(query_image):
    """Detect car parts in *query_image* and retrieve matching video clips.

    Runs the module-level YOLO `model` on the image, then looks up each
    detected class (confidence >= 0.5) in the pre-computed `detection_df`
    index and merges matching frame timestamps into clips.

    Returns a 3-tuple for the Gradio outputs:
        (info_text, annotated_image, clips_dataframe_or_None)
    """
    if query_image is None:
        return "Please upload an image", None, None

    # Gradio may hand us a numpy array; normalize to PIL for drawing.
    if isinstance(query_image, np.ndarray):
        query_image = Image.fromarray(query_image)

    # Detect components in query image (first result = first image).
    results = model(query_image, verbose=False)[0]

    if len(results.boxes) == 0:
        return "No car parts detected in the image", query_image, None

    # Draw boxes on a copy so the caller's image is untouched.
    query_draw = query_image.copy()
    draw = ImageDraw.Draw(query_draw)

    retrieval_info = []
    all_clips = []

    # Process each detected component.
    # NOTE(review): the "β" marker characters in the strings below look like
    # mis-encoded emoji (likely ❌/✅) from the original file — confirm.
    for box_idx in range(len(results.boxes)):
        cls_id = int(results.boxes.cls[box_idx])
        cls_name = model.names[cls_id]
        conf = float(results.boxes.conf[box_idx])
        bbox = results.boxes.xyxy[box_idx].tolist()

        # Skip low-confidence detections entirely (no box drawn, no lookup).
        if conf < 0.5:
            continue

        # Draw bounding box + label above it.
        x1, y1, x2, y2 = bbox
        draw.rectangle([x1, y1, x2, y2], outline='red', width=3)
        draw.text((x1, y1-20), f"{cls_name} ({conf:.2f})", fill='red')

        # Search detection index for frames containing the same class,
        # applying the same 0.5 confidence floor to the indexed detections.
        matches = detection_df[detection_df['class_label'] == cls_name]
        matches = matches[matches['confidence_score'] > 0.5]

        if len(matches) == 0:
            retrieval_info.append(f"β {cls_name}: No matches found")
            continue

        # Merge frame timestamps into contiguous clips (3s gap threshold).
        timestamps = matches['timestamp'].tolist()
        clips = merge_intervals(timestamps, gap_threshold=3.0)

        retrieval_info.append(
            f"β {cls_name} (conf: {conf:.2%}): {len(clips)} clips, {len(matches)} frames"
        )

        for start, end in clips[:3]:  # Limit to first 3 clips per component
            all_clips.append({
                'component': cls_name,
                'start': start,
                'end': end,
                'duration': end - start
            })

    info_text = "\n".join(retrieval_info)

    # Create clips table only when at least one component had matches.
    if all_clips:
        clips_df = pd.DataFrame(all_clips)
        return info_text, query_draw, clips_df
    else:
        return info_text, query_draw, None
|
| 115 |
+
|
| 116 |
+
def extract_frame(component, start_time):
    """Extract one RGB frame from the indexed video at *start_time* seconds.

    *component* appears unused — presumably kept for a UI callback
    signature (verify against the Gradio wiring). Returns a PIL image,
    or None when the video is absent or the seek/read fails.
    """
    if not download_video_if_needed():
        return None

    cap = cv2.VideoCapture(VIDEO_PATH)
    frames_per_second = cap.get(cv2.CAP_PROP_FPS)
    target_frame = int(start_time * frames_per_second)

    cap.set(cv2.CAP_PROP_POS_FRAMES, target_frame)
    ok, bgr_frame = cap.read()
    cap.release()

    if not ok:
        return None
    # OpenCV decodes BGR; convert before handing to PIL.
    return Image.fromarray(cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB))
|
| 133 |
+
|
| 134 |
+
# Create Gradio interface. Declared at module level so `demo` exists for
# the __main__ launch block below.
# NOTE(review): the "π" / "β₯" characters inside the markdown strings look
# like mis-encoded emoji / "≥" from the original file — confirm before
# editing, as they are user-visible text.
with gr.Blocks(title="Image-to-Video Retrieval Demo") as demo:
    gr.Markdown("""
    # π Car Parts Image-to-Video Retrieval System

    Upload an image of a car part, and this system will find matching video clips!

    **How it works:**
    1. Upload a car image (doors, wheels, headlights, etc.)
    2. YOLOv26s detects all car parts in your image
    3. System retrieves matching video clips from the indexed video
    4. View timestamps and sample frames

    **Supported Components:** Doors, wheels, headlights, mirrors, bumpers, and more!
    """)

    with gr.Row():
        # Left column: query input + trigger button.
        with gr.Column(scale=1):
            input_image = gr.Image(type="pil", label="Upload Query Image")
            search_btn = gr.Button("π Search Video", variant="primary")

        # Right column: annotated detections + per-component summary.
        with gr.Column(scale=1):
            output_image = gr.Image(type="pil", label="Detected Components")
            output_text = gr.Textbox(label="Retrieval Results", lines=8)

    with gr.Row():
        # Table of merged (start, end) clips returned by retrieve_clips.
        output_table = gr.Dataframe(
            label="Matching Video Clips",
            headers=["component", "start", "end", "duration"]
        )

    gr.Markdown("""
    ---
    ### π Technical Details
    - **Model:** YOLOv26s fine-tuned on car parts dataset
    - **Video Sampling:** Every 5th frame
    - **Matching:** Semantic component matching with confidence β₯ 0.5
    - **Clip Formation:** 3.0s gap threshold for temporal merging

    **Assignment 2 - CS-UY 4613 Artificial Intelligence**
    Hanze (James) Qiu | Spring 2026
    """)

    # Connect button: retrieve_clips returns (text, image, table) in the
    # same order as the outputs list.
    search_btn.click(
        fn=retrieve_clips,
        inputs=[input_image],
        outputs=[output_text, output_image, output_table]
    )

    # Example images (optional - add paths to example images)
    # NOTE(review): some Gradio versions reject an empty examples list at
    # startup — consider removing this call until real examples exist.
    gr.Examples(
        examples=[
            # Add paths to example images if you have them
            # ["examples/car1.jpg"],
            # ["examples/car2.jpg"],
        ],
        inputs=input_image,
        label="Example Query Images"
    )
|
| 194 |
+
|
| 195 |
+
if __name__ == "__main__":
    # Launch the app on all interfaces; host/port mirror the EXPOSE and
    # GRADIO_SERVER_* settings in the Dockerfile.
    print("Starting Gradio app...")
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo.launch(**launch_options)
|
best.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:77c3e8dc01a08bd99cc5b1f301f4237bd09a79d991b9eb5ec4635065c3feb110
|
| 3 |
+
size 20343045
|
detections.parquet
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e82da2dcb9eada79ba0c3c94256e0f88fe0dfa151437e271e860695f23a500e4
|
| 3 |
+
size 1400297
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch>=2.0.0
|
| 2 |
+
ultralytics>=8.0.0
|
| 3 |
+
opencv-python-headless==4.8.1.78
|
| 4 |
+
pandas>=2.0.0
|
| 5 |
+
pyarrow>=14.0.0
|
| 6 |
+
datasets>=2.16.0
|
| 7 |
+
gradio>=4.0.0
|
| 8 |
+
Pillow>=10.0.0
|
| 9 |
+
numpy>=1.24.0
|