prithivMLmods commited on
Commit
4c21b52
·
verified ·
1 Parent(s): 678e058

update app

Browse files
Files changed (1) hide show
  1. app.py +121 -339
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import os
2
  import gc
3
  import json
4
- import uuid
5
  import time
6
  import base64
7
  from io import BytesIO
@@ -10,12 +9,10 @@ from threading import Thread
10
  import gradio as gr
11
  import spaces
12
  import torch
13
- import numpy as np
14
  from PIL import Image
15
  import cv2
16
 
17
  from transformers import (
18
- Qwen2VLForConditionalGeneration,
19
  Qwen2_5_VLForConditionalGeneration,
20
  AutoProcessor,
21
  TextIteratorStreamer,
@@ -74,17 +71,10 @@ MODEL_MAP = {
74
  MODEL_CHOICES = list(MODEL_MAP.keys())
75
 
76
  image_examples = [
77
- {"query": "Perform OCR on the text in the image.", "media": "images/1.jpg", "model": "docscopeOCR-7B-050425-exp", "mode": "image"},
78
- {"query": "Explain the scene in detail.", "media": "images/2.jpg", "model": "Cosmos-Reason1-7B", "mode": "image"},
79
  ]
80
 
81
- video_examples = [
82
- {"query": "Explain the Ad in Detail", "media": "videos/1.mp4", "model": "Captioner-7B-Qwen2.5VL", "mode": "video"},
83
- {"query": "Identify the main actions in the video", "media": "videos/2.mp4", "model": "visionOCR-3B", "mode": "video"},
84
- ]
85
-
86
- all_examples = image_examples + video_examples
87
-
88
 
89
  def pil_to_data_url(img: Image.Image, fmt="PNG"):
90
  buf = BytesIO()
@@ -103,27 +93,15 @@ def file_to_data_url(path):
103
  "jpeg": "image/jpeg",
104
  "png": "image/png",
105
  "webp": "image/webp",
106
- "mp4": "video/mp4",
107
- "mov": "video/quicktime",
108
- "webm": "video/webm",
109
- }.get(ext, "application/octet-stream")
110
  with open(path, "rb") as f:
111
  data = base64.b64encode(f.read()).decode()
112
  return f"data:{mime};base64,{data}"
113
 
114
 
115
- def make_thumb_b64(path, mode="image", max_dim=240):
116
  try:
117
- if mode == "video":
118
- cap = cv2.VideoCapture(path)
119
- ok, frame = cap.read()
120
- cap.release()
121
- if not ok:
122
- return ""
123
- frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
124
- img = Image.fromarray(frame).convert("RGB")
125
- else:
126
- img = Image.open(path).convert("RGB")
127
  img.thumbnail((max_dim, max_dim))
128
  return pil_to_data_url(img, "JPEG")
129
  except Exception as e:
@@ -133,15 +111,14 @@ def make_thumb_b64(path, mode="image", max_dim=240):
133
 
134
  def build_example_cards_html():
135
  cards = ""
136
- for i, ex in enumerate(all_examples):
137
- thumb = make_thumb_b64(ex["media"], ex["mode"])
138
  prompt_short = ex["query"][:72] + ("..." if len(ex["query"]) > 72 else "")
139
- media_badge = "VIDEO" if ex["mode"] == "video" else "IMAGE"
140
  cards += f"""
141
  <div class="example-card" data-idx="{i}">
142
  <div class="example-thumb-wrap">
143
  {"<img src='" + thumb + "' alt=''>" if thumb else "<div class='example-thumb-placeholder'>Preview</div>"}
144
- <div class="example-media-chip">{media_badge}</div>
145
  </div>
146
  <div class="example-meta-row">
147
  <span class="example-badge">{ex["model"]}</span>
@@ -160,18 +137,17 @@ def load_example_data(idx_str):
160
  idx = int(float(idx_str))
161
  except Exception:
162
  return json.dumps({"status": "error", "message": "Invalid example index"})
163
- if idx < 0 or idx >= len(all_examples):
164
  return json.dumps({"status": "error", "message": "Example index out of range"})
165
- ex = all_examples[idx]
166
  media_b64 = file_to_data_url(ex["media"])
167
  if not media_b64:
168
- return json.dumps({"status": "error", "message": f"Could not load example {ex['mode']}"})
169
  return json.dumps({
170
  "status": "ok",
171
  "query": ex["query"],
172
  "media": media_b64,
173
  "model": ex["model"],
174
- "mode": ex["mode"],
175
  "name": os.path.basename(ex["media"]),
176
  })
177
 
@@ -190,54 +166,6 @@ def b64_to_pil(b64_str):
190
  return None
191
 
192
 
193
- def b64_to_temp_video(b64_str):
194
- if not b64_str:
195
- return None
196
- try:
197
- if b64_str.startswith("data:"):
198
- header, data = b64_str.split(",", 1)
199
- mime = header.split(";")[0].replace("data:", "")
200
- else:
201
- data = b64_str
202
- mime = "video/mp4"
203
- ext = {
204
- "video/mp4": ".mp4",
205
- "video/webm": ".webm",
206
- "video/quicktime": ".mov",
207
- }.get(mime, ".mp4")
208
- raw = base64.b64decode(data)
209
- temp_dir = os.path.join("/tmp", "docscope_r1_media")
210
- os.makedirs(temp_dir, exist_ok=True)
211
- path = os.path.join(temp_dir, f"{uuid.uuid4().hex}{ext}")
212
- with open(path, "wb") as f:
213
- f.write(raw)
214
- return path
215
- except Exception:
216
- return None
217
-
218
-
219
- def downsample_video(video_path):
220
- vidcap = cv2.VideoCapture(video_path)
221
- total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
222
- fps = vidcap.get(cv2.CAP_PROP_FPS) or 1.0
223
- frames = []
224
- frame_count = min(total_frames, 10) if total_frames > 0 else 0
225
- if frame_count == 0:
226
- vidcap.release()
227
- return frames
228
- frame_indices = np.linspace(0, total_frames - 1, frame_count, dtype=int)
229
- for i in frame_indices:
230
- vidcap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
231
- success, image = vidcap.read()
232
- if success:
233
- image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
234
- pil_image = Image.fromarray(image)
235
- timestamp = round(float(i) / float(fps), 2)
236
- frames.append((pil_image, timestamp))
237
- vidcap.release()
238
- return frames
239
-
240
-
241
  def calc_timeout_image(model_name, text, image, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_timeout):
242
  try:
243
  return int(gpu_timeout)
@@ -245,13 +173,6 @@ def calc_timeout_image(model_name, text, image, max_new_tokens, temperature, top
245
  return 60
246
 
247
 
248
- def calc_timeout_video(model_name, text, video_path, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_timeout):
249
- try:
250
- return int(gpu_timeout)
251
- except Exception:
252
- return 60
253
-
254
-
255
  @spaces.GPU(duration=calc_timeout_image)
256
  def generate_image(model_name, text, image, max_new_tokens=1024, temperature=0.6, top_p=0.9, top_k=50, repetition_penalty=1.2, gpu_timeout=60):
257
  if not model_name or model_name not in MODEL_MAP:
@@ -314,102 +235,19 @@ def generate_image(model_name, text, image, max_new_tokens=1024, temperature=0.6
314
  torch.cuda.empty_cache()
315
 
316
 
317
- @spaces.GPU(duration=calc_timeout_video)
318
- def generate_video(model_name, text, video_path, max_new_tokens=1024, temperature=0.6, top_p=0.9, top_k=50, repetition_penalty=1.2, gpu_timeout=90):
319
- if not model_name or model_name not in MODEL_MAP:
320
- raise gr.Error("Please select a valid model.")
321
- if not video_path:
322
- raise gr.Error("Please upload a video.")
323
- if not text or not str(text).strip():
324
- raise gr.Error("Please enter your instruction.")
325
- if len(str(text)) > MAX_INPUT_TOKEN_LENGTH * 8:
326
- raise gr.Error("Query is too long. Please shorten your input.")
327
-
328
- processor, model = MODEL_MAP[model_name]
329
- frames = downsample_video(video_path)
330
- if not frames:
331
- raise gr.Error("Could not read the uploaded video.")
332
-
333
- messages = [
334
- {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
335
- {"role": "user", "content": [{"type": "text", "text": text}]}
336
- ]
337
-
338
- for image, timestamp in frames:
339
- messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
340
- messages[1]["content"].append({"type": "image", "image": image})
341
-
342
- inputs = processor.apply_chat_template(
343
- messages,
344
- tokenize=True,
345
- add_generation_prompt=True,
346
- return_dict=True,
347
- return_tensors="pt",
348
- truncation=True,
349
- max_length=MAX_INPUT_TOKEN_LENGTH
350
- ).to(device)
351
-
352
- streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
353
- generation_kwargs = {
354
- **inputs,
355
- "streamer": streamer,
356
- "max_new_tokens": int(max_new_tokens),
357
- "do_sample": True,
358
- "temperature": float(temperature),
359
- "top_p": float(top_p),
360
- "top_k": int(top_k),
361
- "repetition_penalty": float(repetition_penalty),
362
- }
363
-
364
- thread = Thread(target=model.generate, kwargs=generation_kwargs)
365
- thread.start()
366
-
367
- buffer = ""
368
- for new_text in streamer:
369
- buffer += new_text.replace("<|im_end|>", "")
370
- time.sleep(0.01)
371
- yield buffer
372
-
373
- gc.collect()
374
- if torch.cuda.is_available():
375
- torch.cuda.empty_cache()
376
-
377
-
378
- def run_inference(mode, model_name, text, image_b64, video_b64, max_new_tokens_v, temperature_v, top_p_v, top_k_v, repetition_penalty_v, gpu_timeout_v):
379
- if mode == "video":
380
- temp_video_path = b64_to_temp_video(video_b64)
381
- if not temp_video_path:
382
- raise gr.Error("Could not decode uploaded video.")
383
- try:
384
- yield from generate_video(
385
- model_name=model_name,
386
- text=text,
387
- video_path=temp_video_path,
388
- max_new_tokens=max_new_tokens_v,
389
- temperature=temperature_v,
390
- top_p=top_p_v,
391
- top_k=top_k_v,
392
- repetition_penalty=repetition_penalty_v,
393
- gpu_timeout=gpu_timeout_v,
394
- )
395
- finally:
396
- try:
397
- os.remove(temp_video_path)
398
- except Exception:
399
- pass
400
- else:
401
- image = b64_to_pil(image_b64)
402
- yield from generate_image(
403
- model_name=model_name,
404
- text=text,
405
- image=image,
406
- max_new_tokens=max_new_tokens_v,
407
- temperature=temperature_v,
408
- top_p=top_p_v,
409
- top_k=top_k_v,
410
- repetition_penalty=repetition_penalty_v,
411
- gpu_timeout=gpu_timeout_v,
412
- )
413
 
414
 
415
  def noop():
@@ -475,19 +313,6 @@ footer{display:none!important}
475
  .model-tab.active{background:rgba(255,20,147,.22);border-color:#FF1493;color:#fff!important;box-shadow:0 0 0 2px rgba(255,20,147,.10)}
476
  .model-tab-label{font-size:12px;color:#ffffff!important;font-weight:600}
477
 
478
- .mode-tabs-bar{
479
- background:#18181b;border-bottom:1px solid #27272a;padding:10px 16px 12px;
480
- display:flex;gap:8px;align-items:center;flex-wrap:wrap;
481
- }
482
- .mode-tab{
483
- display:inline-flex;align-items:center;justify-content:center;gap:6px;
484
- min-width:110px;height:34px;background:transparent;border:1px solid #27272a;
485
- border-radius:999px;cursor:pointer;font-size:12px;font-weight:700;padding:0 14px;
486
- color:#ffffff!important;transition:all .15s ease;text-transform:uppercase;letter-spacing:.5px;
487
- }
488
- .mode-tab:hover{background:rgba(255,20,147,.12);border-color:rgba(255,20,147,.35)}
489
- .mode-tab.active{background:rgba(255,20,147,.22);border-color:#FF1493;color:#fff!important;box-shadow:0 0 0 2px rgba(255,20,147,.10)}
490
-
491
  .app-main-row{display:flex;gap:0;flex:1;overflow:hidden}
492
  .app-main-left{flex:1;display:flex;flex-direction:column;min-width:0;border-right:1px solid #27272a}
493
  .app-main-right{width:470px;display:flex;flex-direction:column;flex-shrink:0;background:#18181b}
@@ -523,7 +348,7 @@ footer{display:none!important}
523
  overflow:hidden;border:1px solid #27272a;background:#111114;
524
  display:flex;align-items:center;justify-content:center;position:relative;
525
  }
526
- .single-preview-card img,.single-preview-card video{
527
  width:100%;height:100%;max-width:100%;max-height:100%;
528
  object-fit:contain;display:block;background:#000;
529
  }
@@ -757,24 +582,23 @@ function init() {
757
  const fileInput = document.getElementById('custom-file-input');
758
  const previewWrap = document.getElementById('single-preview-wrap');
759
  const previewImg = document.getElementById('single-preview-img');
760
- const previewVideo = document.getElementById('single-preview-video');
761
  const btnUpload = document.getElementById('preview-upload-btn');
762
  const btnClear = document.getElementById('preview-clear-btn');
763
  const promptInput = document.getElementById('custom-query-input');
764
  const runBtnEl = document.getElementById('custom-run-btn');
765
  const outputArea = document.getElementById('custom-output-textarea');
766
  const mediaStatus = document.getElementById('sb-media-status');
767
- const exampleResultContainer = document.getElementById('example-result-data');
768
 
769
- if (!dropZone || !fileInput || !promptInput || !previewWrap || !previewImg || !previewVideo) {
770
  setTimeout(init, 250);
771
  return;
772
  }
773
 
774
  window.__docScopeInitDone = true;
775
  let mediaState = null;
776
- let currentMode = 'image';
777
  let toastTimer = null;
 
 
778
 
779
  function showToast(message, type) {
780
  let toast = document.getElementById('app-toast');
@@ -826,6 +650,13 @@ function init() {
826
  setTimeout(() => outputArea.classList.remove('error-flash'), 800);
827
  }
828
 
 
 
 
 
 
 
 
829
  function setGradioValue(containerId, value) {
830
  const container = document.getElementById(containerId);
831
  if (!container) return;
@@ -841,10 +672,9 @@ function init() {
841
  });
842
  }
843
 
844
- function syncMediaToGradio() {
845
- setGradioValue('hidden-image-b64', mediaState && mediaState.mode === 'image' ? mediaState.b64 : '');
846
- setGradioValue('hidden-video-b64', mediaState && mediaState.mode === 'video' ? mediaState.b64 : '');
847
- const txt = mediaState ? (`1 ${mediaState.mode} uploaded`) : `No ${currentMode} uploaded`;
848
  if (mediaStatus) mediaStatus.textContent = txt;
849
  }
850
 
@@ -856,43 +686,25 @@ function init() {
856
  setGradioValue('hidden-model-name', name);
857
  }
858
 
859
- function syncModeToGradio(mode) {
860
- setGradioValue('hidden-mode-name', mode);
861
- }
862
-
863
  function renderPreview() {
864
  if (!mediaState) {
865
  previewImg.src = '';
866
- previewVideo.src = '';
867
  previewImg.style.display = 'none';
868
- previewVideo.style.display = 'none';
869
  previewWrap.style.display = 'none';
870
  if (uploadPrompt) uploadPrompt.style.display = 'flex';
871
- syncMediaToGradio();
872
  return;
873
  }
874
 
875
- if (mediaState.mode === 'video') {
876
- previewImg.src = '';
877
- previewImg.style.display = 'none';
878
- previewVideo.src = mediaState.b64;
879
- previewVideo.style.display = 'block';
880
- previewWrap.style.display = 'flex';
881
- } else {
882
- previewVideo.pause();
883
- previewVideo.removeAttribute('src');
884
- previewVideo.load();
885
- previewVideo.style.display = 'none';
886
- previewImg.src = mediaState.b64;
887
- previewImg.style.display = 'block';
888
- previewWrap.style.display = 'flex';
889
- }
890
  if (uploadPrompt) uploadPrompt.style.display = 'none';
891
- syncMediaToGradio();
892
  }
893
 
894
- function setPreview(b64, name, mode) {
895
- mediaState = {b64, name: name || 'file', mode: mode || currentMode};
896
  renderPreview();
897
  }
898
  window.__setPreview = setPreview;
@@ -905,40 +717,25 @@ function init() {
905
 
906
  function processFile(file) {
907
  if (!file) return;
908
- if (currentMode === 'image' && !file.type.startsWith('image/')) {
909
- showToast('Only image files are supported in Image mode', 'error');
910
- return;
911
- }
912
- if (currentMode === 'video' && !file.type.startsWith('video/')) {
913
- showToast('Only video files are supported in Video mode', 'error');
914
  return;
915
  }
916
  const reader = new FileReader();
917
- reader.onload = (e) => setPreview(e.target.result, file.name, currentMode);
918
  reader.readAsDataURL(file);
919
  }
920
 
 
 
 
 
921
  fileInput.addEventListener('change', (e) => {
922
  const file = e.target.files && e.target.files[0] ? e.target.files[0] : null;
923
  if (file) processFile(file);
924
  e.target.value = '';
925
  });
926
 
927
- function updateAccept() {
928
- fileInput.accept = currentMode === 'video' ? 'video/*' : 'image/*';
929
- const main = document.getElementById('upload-main-text');
930
- const sub = document.getElementById('upload-sub-text');
931
- if (main) main.textContent = currentMode === 'video' ? 'Click or drag a video here' : 'Click or drag an image here';
932
- if (sub) sub.textContent = currentMode === 'video'
933
- ? 'Upload one short video clip for document-aware video understanding'
934
- : 'Upload one document, page, screenshot, receipt, or scene image for OCR and reasoning';
935
- if (!mediaState && mediaStatus) mediaStatus.textContent = `No ${currentMode} uploaded`;
936
- }
937
-
938
- if (uploadClick) uploadClick.addEventListener('click', () => fileInput.click());
939
- if (btnUpload) btnUpload.addEventListener('click', () => fileInput.click());
940
- if (btnClear) btnClear.addEventListener('click', clearPreview);
941
-
942
  dropZone.addEventListener('dragover', (e) => {
943
  e.preventDefault();
944
  dropZone.classList.add('drag-over');
@@ -963,26 +760,11 @@ function init() {
963
  }
964
  window.__activateModelTab = activateModelTab;
965
 
966
- function activateModeTab(mode) {
967
- currentMode = mode;
968
- document.querySelectorAll('.mode-tab[data-mode]').forEach(btn => {
969
- btn.classList.toggle('active', btn.getAttribute('data-mode') === mode);
970
- });
971
- syncModeToGradio(mode);
972
- updateAccept();
973
- if (mediaState && mediaState.mode !== mode) clearPreview();
974
- }
975
- window.__activateModeTab = activateModeTab;
976
-
977
  document.querySelectorAll('.model-tab[data-model]').forEach(btn => {
978
  btn.addEventListener('click', () => activateModelTab(btn.getAttribute('data-model')));
979
  });
980
- document.querySelectorAll('.mode-tab[data-mode]').forEach(btn => {
981
- btn.addEventListener('click', () => activateModeTab(btn.getAttribute('data-mode')));
982
- });
983
 
984
  activateModelTab('Cosmos-Reason1-7B');
985
- activateModeTab('image');
986
 
987
  function syncSlider(customId, gradioId) {
988
  const slider = document.getElementById(customId);
@@ -1013,16 +795,12 @@ function init() {
1013
  function validateBeforeRun() {
1014
  const promptVal = promptInput.value.trim();
1015
  if (!mediaState && !promptVal) {
1016
- showToast(`Please upload a ${currentMode} and enter your instruction`, 'error');
1017
  flashPromptError();
1018
  return false;
1019
  }
1020
  if (!mediaState) {
1021
- showToast(`Please upload a ${currentMode}`, 'error');
1022
- return false;
1023
- }
1024
- if (mediaState.mode !== currentMode) {
1025
- showToast(`Uploaded media does not match ${currentMode} mode`, 'error');
1026
  return false;
1027
  }
1028
  if (!promptVal) {
@@ -1041,11 +819,9 @@ function init() {
1041
  window.__clickGradioRunBtn = function() {
1042
  if (!validateBeforeRun()) return;
1043
  syncPromptToGradio();
1044
- syncMediaToGradio();
1045
  const activeModel = document.querySelector('.model-tab.active');
1046
  if (activeModel) syncModelToGradio(activeModel.getAttribute('data-model'));
1047
- const activeMode = document.querySelector('.mode-tab.active');
1048
- if (activeMode) syncModeToGradio(activeMode.getAttribute('data-mode'));
1049
  if (outputArea) outputArea.value = '';
1050
  showLoader();
1051
  setTimeout(() => {
@@ -1099,55 +875,86 @@ function init() {
1099
  });
1100
  }
1101
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1102
  document.querySelectorAll('.example-card[data-idx]').forEach(card => {
1103
  card.addEventListener('click', () => {
1104
  const idx = card.getAttribute('data-idx');
1105
  document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
1106
  card.classList.add('loading');
1107
  showToast('Loading example...', 'info');
 
1108
  setGradioValue('example-result-data', '');
1109
  setGradioValue('example-idx-input', idx);
 
1110
  setTimeout(() => {
1111
  const btn = document.getElementById('example-load-btn');
1112
  if (btn) {
1113
  const b = btn.querySelector('button');
1114
  if (b) b.click(); else btn.click();
1115
  }
1116
- }, 150);
1117
- setTimeout(() => card.classList.remove('loading'), 12000);
1118
  });
1119
  });
1120
 
1121
- function checkExampleResult() {
1122
- if (!exampleResultContainer) return;
1123
- const el = exampleResultContainer.querySelector('textarea') || exampleResultContainer.querySelector('input');
1124
- if (!el || !el.value) return;
1125
- if (window.__lastExampleVal === el.value) return;
1126
- try {
1127
- const data = JSON.parse(el.value);
1128
- if (data.status === 'ok') {
1129
- window.__lastExampleVal = el.value;
1130
- if (data.mode) activateModeTab(data.mode);
1131
- if (data.media) setPreview(data.media, data.name || 'example', data.mode || 'image');
1132
- if (data.query) {
1133
- promptInput.value = data.query;
1134
- syncPromptToGradio();
1135
  }
1136
- if (data.model) activateModelTab(data.model);
1137
- document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
1138
- showToast('Example loaded', 'info');
1139
- } else if (data.status === 'error') {
1140
- document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
1141
- showToast(data.message || 'Failed to load example', 'error');
1142
  }
1143
- } catch(e) {}
1144
- }
1145
-
1146
- const obsExample = new MutationObserver(checkExampleResult);
1147
- if (exampleResultContainer) {
1148
- obsExample.observe(exampleResultContainer, {childList:true, subtree:true, characterData:true, attributes:true});
1149
  }
1150
- setInterval(checkExampleResult, 500);
1151
 
1152
  if (outputArea) outputArea.value = '';
1153
  const sb = document.getElementById('sb-run-state');
@@ -1210,15 +1017,8 @@ MODEL_TABS_HTML = "".join([
1210
  for m in MODEL_CHOICES
1211
  ])
1212
 
1213
- MODE_TABS_HTML = """
1214
- <button class="mode-tab active" data-mode="image">Image Inference</button>
1215
- <button class="mode-tab" data-mode="video">Video Inference</button>
1216
- """
1217
-
1218
  with gr.Blocks() as demo:
1219
- hidden_mode_name = gr.Textbox(value="image", elem_id="hidden-mode-name", elem_classes="hidden-input", container=False)
1220
  hidden_image_b64 = gr.Textbox(value="", elem_id="hidden-image-b64", elem_classes="hidden-input", container=False)
1221
- hidden_video_b64 = gr.Textbox(value="", elem_id="hidden-video-b64", elem_classes="hidden-input", container=False)
1222
  prompt = gr.Textbox(value="", elem_id="prompt-gradio-input", elem_classes="hidden-input", container=False)
1223
  hidden_model_name = gr.Textbox(value="Cosmos-Reason1-7B", elem_id="hidden-model-name", elem_classes="hidden-input", container=False)
1224
 
@@ -1250,10 +1050,6 @@ with gr.Blocks() as demo:
1250
  {MODEL_TABS_HTML}
1251
  </div>
1252
 
1253
- <div class="mode-tabs-bar">
1254
- {MODE_TABS_HTML}
1255
- </div>
1256
-
1257
  <div class="app-main-row">
1258
  <div class="app-main-left">
1259
  <div id="media-drop-zone">
@@ -1270,7 +1066,6 @@ with gr.Blocks() as demo:
1270
  <div id="single-preview-wrap" class="single-preview-wrap">
1271
  <div class="single-preview-card">
1272
  <img id="single-preview-img" src="" alt="Preview" style="display:none;">
1273
- <video id="single-preview-video" controls playsinline style="display:none;"></video>
1274
  <div class="preview-overlay-actions">
1275
  <button id="preview-upload-btn" class="preview-action-btn" title="Replace">Upload</button>
1276
  <button id="preview-clear-btn" class="preview-action-btn" title="Clear">Clear</button>
@@ -1280,10 +1075,9 @@ with gr.Blocks() as demo:
1280
  </div>
1281
 
1282
  <div class="hint-bar">
1283
- <b>Upload:</b> Click or drag media into the panel &nbsp;&middot;&nbsp;
1284
- <b>Mode:</b> Switch between image and video inference &nbsp;&middot;&nbsp;
1285
  <b>Model:</b> Change models from the header &nbsp;&middot;&nbsp;
1286
- <kbd>Clear</kbd> removes the current media
1287
  </div>
1288
 
1289
  <div class="examples-section">
@@ -1299,7 +1093,7 @@ with gr.Blocks() as demo:
1299
  <div class="panel-card-title">Vision / OCR Instruction</div>
1300
  <div class="panel-card-body">
1301
  <label class="modern-label" for="custom-query-input">Query Input</label>
1302
- <textarea id="custom-query-input" class="modern-textarea" rows="4" placeholder="e.g., perform OCR on this image, describe the document, explain the ad, summarize the video, identify visible text, analyze the scene..."></textarea>
1303
  </div>
1304
  </div>
1305
 
@@ -1386,11 +1180,9 @@ with gr.Blocks() as demo:
1386
  run_btn.click(
1387
  fn=run_inference,
1388
  inputs=[
1389
- hidden_mode_name,
1390
  hidden_model_name,
1391
  prompt,
1392
  hidden_image_b64,
1393
- hidden_video_b64,
1394
  max_new_tokens,
1395
  temperature,
1396
  top_p,
@@ -1399,30 +1191,20 @@ with gr.Blocks() as demo:
1399
  gpu_duration_state,
1400
  ],
1401
  outputs=[result],
1402
- js=r"""(mode, model, p, img, vid, mnt, t, tp, tk, rp, gd) => {
1403
  const modelEl = document.querySelector('.model-tab.active');
1404
- const modeEl = document.querySelector('.mode-tab.active');
1405
  const modelVal = modelEl ? modelEl.getAttribute('data-model') : model;
1406
- const modeVal = modeEl ? modeEl.getAttribute('data-mode') : mode;
1407
  const promptEl = document.getElementById('custom-query-input');
1408
  const promptVal = promptEl ? promptEl.value : p;
1409
 
1410
  let imgVal = img;
1411
- let vidVal = vid;
1412
-
1413
  const imgContainer = document.getElementById('hidden-image-b64');
1414
- const vidContainer = document.getElementById('hidden-video-b64');
1415
-
1416
  if (imgContainer) {
1417
  const inner = imgContainer.querySelector('textarea, input');
1418
  if (inner) imgVal = inner.value;
1419
  }
1420
- if (vidContainer) {
1421
- const inner = vidContainer.querySelector('textarea, input');
1422
- if (inner) vidVal = inner.value;
1423
- }
1424
 
1425
- return [modeVal, modelVal, promptVal, imgVal, vidVal, mnt, t, tp, tk, rp, gd];
1426
  }""",
1427
  )
1428
 
@@ -1439,5 +1221,5 @@ if __name__ == "__main__":
1439
  mcp_server=True,
1440
  ssr_mode=False,
1441
  show_error=True,
1442
- allowed_paths=["images", "videos"],
1443
  )
 
1
  import os
2
  import gc
3
  import json
 
4
  import time
5
  import base64
6
  from io import BytesIO
 
9
  import gradio as gr
10
  import spaces
11
  import torch
 
12
  from PIL import Image
13
  import cv2
14
 
15
  from transformers import (
 
16
  Qwen2_5_VLForConditionalGeneration,
17
  AutoProcessor,
18
  TextIteratorStreamer,
 
71
  MODEL_CHOICES = list(MODEL_MAP.keys())
72
 
73
  image_examples = [
74
+ {"query": "Perform OCR on the text in the image.", "media": "images/1.jpg", "model": "docscopeOCR-7B-050425-exp"},
75
+ {"query": "Explain the scene in detail.", "media": "images/2.jpg", "model": "Cosmos-Reason1-7B"},
76
  ]
77
 
 
 
 
 
 
 
 
78
 
79
  def pil_to_data_url(img: Image.Image, fmt="PNG"):
80
  buf = BytesIO()
 
93
  "jpeg": "image/jpeg",
94
  "png": "image/png",
95
  "webp": "image/webp",
96
+ }.get(ext, "image/jpeg")
 
 
 
97
  with open(path, "rb") as f:
98
  data = base64.b64encode(f.read()).decode()
99
  return f"data:{mime};base64,{data}"
100
 
101
 
102
+ def make_thumb_b64(path, max_dim=240):
103
  try:
104
+ img = Image.open(path).convert("RGB")
 
 
 
 
 
 
 
 
 
105
  img.thumbnail((max_dim, max_dim))
106
  return pil_to_data_url(img, "JPEG")
107
  except Exception as e:
 
111
 
112
  def build_example_cards_html():
113
  cards = ""
114
+ for i, ex in enumerate(image_examples):
115
+ thumb = make_thumb_b64(ex["media"])
116
  prompt_short = ex["query"][:72] + ("..." if len(ex["query"]) > 72 else "")
 
117
  cards += f"""
118
  <div class="example-card" data-idx="{i}">
119
  <div class="example-thumb-wrap">
120
  {"<img src='" + thumb + "' alt=''>" if thumb else "<div class='example-thumb-placeholder'>Preview</div>"}
121
+ <div class="example-media-chip">IMAGE</div>
122
  </div>
123
  <div class="example-meta-row">
124
  <span class="example-badge">{ex["model"]}</span>
 
137
  idx = int(float(idx_str))
138
  except Exception:
139
  return json.dumps({"status": "error", "message": "Invalid example index"})
140
+ if idx < 0 or idx >= len(image_examples):
141
  return json.dumps({"status": "error", "message": "Example index out of range"})
142
+ ex = image_examples[idx]
143
  media_b64 = file_to_data_url(ex["media"])
144
  if not media_b64:
145
+ return json.dumps({"status": "error", "message": "Could not load example image"})
146
  return json.dumps({
147
  "status": "ok",
148
  "query": ex["query"],
149
  "media": media_b64,
150
  "model": ex["model"],
 
151
  "name": os.path.basename(ex["media"]),
152
  })
153
 
 
166
  return None
167
 
168
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  def calc_timeout_image(model_name, text, image, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_timeout):
170
  try:
171
  return int(gpu_timeout)
 
173
  return 60
174
 
175
 
 
 
 
 
 
 
 
176
  @spaces.GPU(duration=calc_timeout_image)
177
  def generate_image(model_name, text, image, max_new_tokens=1024, temperature=0.6, top_p=0.9, top_k=50, repetition_penalty=1.2, gpu_timeout=60):
178
  if not model_name or model_name not in MODEL_MAP:
 
235
  torch.cuda.empty_cache()
236
 
237
 
238
+ def run_inference(model_name, text, image_b64, max_new_tokens_v, temperature_v, top_p_v, top_k_v, repetition_penalty_v, gpu_timeout_v):
239
+ image = b64_to_pil(image_b64)
240
+ yield from generate_image(
241
+ model_name=model_name,
242
+ text=text,
243
+ image=image,
244
+ max_new_tokens=max_new_tokens_v,
245
+ temperature=temperature_v,
246
+ top_p=top_p_v,
247
+ top_k=top_k_v,
248
+ repetition_penalty=repetition_penalty_v,
249
+ gpu_timeout=gpu_timeout_v,
250
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
 
252
 
253
  def noop():
 
313
  .model-tab.active{background:rgba(255,20,147,.22);border-color:#FF1493;color:#fff!important;box-shadow:0 0 0 2px rgba(255,20,147,.10)}
314
  .model-tab-label{font-size:12px;color:#ffffff!important;font-weight:600}
315
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
  .app-main-row{display:flex;gap:0;flex:1;overflow:hidden}
317
  .app-main-left{flex:1;display:flex;flex-direction:column;min-width:0;border-right:1px solid #27272a}
318
  .app-main-right{width:470px;display:flex;flex-direction:column;flex-shrink:0;background:#18181b}
 
348
  overflow:hidden;border:1px solid #27272a;background:#111114;
349
  display:flex;align-items:center;justify-content:center;position:relative;
350
  }
351
+ .single-preview-card img{
352
  width:100%;height:100%;max-width:100%;max-height:100%;
353
  object-fit:contain;display:block;background:#000;
354
  }
 
582
  const fileInput = document.getElementById('custom-file-input');
583
  const previewWrap = document.getElementById('single-preview-wrap');
584
  const previewImg = document.getElementById('single-preview-img');
 
585
  const btnUpload = document.getElementById('preview-upload-btn');
586
  const btnClear = document.getElementById('preview-clear-btn');
587
  const promptInput = document.getElementById('custom-query-input');
588
  const runBtnEl = document.getElementById('custom-run-btn');
589
  const outputArea = document.getElementById('custom-output-textarea');
590
  const mediaStatus = document.getElementById('sb-media-status');
 
591
 
592
+ if (!dropZone || !fileInput || !promptInput || !previewWrap || !previewImg) {
593
  setTimeout(init, 250);
594
  return;
595
  }
596
 
597
  window.__docScopeInitDone = true;
598
  let mediaState = null;
 
599
  let toastTimer = null;
600
+ let examplePoller = null;
601
+ let lastSeenExamplePayload = null;
602
 
603
  function showToast(message, type) {
604
  let toast = document.getElementById('app-toast');
 
650
  setTimeout(() => outputArea.classList.remove('error-flash'), 800);
651
  }
652
 
653
+ function getValueFromContainer(containerId) {
654
+ const container = document.getElementById(containerId);
655
+ if (!container) return '';
656
+ const el = container.querySelector('textarea, input');
657
+ return el ? (el.value || '') : '';
658
+ }
659
+
660
  function setGradioValue(containerId, value) {
661
  const container = document.getElementById(containerId);
662
  if (!container) return;
 
672
  });
673
  }
674
 
675
// Mirror the in-page image state into the hidden Gradio textbox and the
// sidebar status label.
function syncImageToGradio() {
  if (mediaState) {
    setGradioValue('hidden-image-b64', mediaState.b64);
    if (mediaStatus) mediaStatus.textContent = '1 image uploaded';
  } else {
    setGradioValue('hidden-image-b64', '');
    if (mediaStatus) mediaStatus.textContent = 'No image uploaded';
  }
}
680
 
 
686
  setGradioValue('hidden-model-name', name);
687
  }
688
 
 
 
 
 
689
// Render the preview pane from mediaState: show the image when one is set,
// otherwise show the empty-state upload prompt. Always re-syncs Gradio.
function renderPreview() {
  const hasImage = Boolean(mediaState);
  previewImg.src = hasImage ? mediaState.b64 : '';
  previewImg.style.display = hasImage ? 'block' : 'none';
  previewWrap.style.display = hasImage ? 'flex' : 'none';
  if (uploadPrompt) uploadPrompt.style.display = hasImage ? 'none' : 'flex';
  syncImageToGradio();
}
705
 
706
// Store a newly chosen image (base64 data URL + display name) and repaint.
function setPreview(b64, name) {
  mediaState = {b64: b64, name: name ? name : 'file'};
  renderPreview();
}
// Exposed globally so example-loading code can inject images.
window.__setPreview = setPreview;
 
717
 
718
// Accept a dropped or selected File: images only, read into a data URL and
// hand off to setPreview.
function processFile(file) {
  if (!file) return;
  if (file.type.indexOf('image/') !== 0) {
    showToast('Only image files are supported', 'error');
    return;
  }
  const reader = new FileReader();
  reader.onload = function (evt) {
    setPreview(evt.target.result, file.name);
  };
  reader.readAsDataURL(file);
}
728
 
729
// Wire the upload entry points: the prompt area and the overlay Upload
// button both open the native file picker; Clear resets the preview.
[uploadClick, btnUpload].forEach((el) => {
  if (el) el.addEventListener('click', () => fileInput.click());
});
if (btnClear) btnClear.addEventListener('click', clearPreview);

// Forward the picked file, then reset the input so choosing the same file
// again still fires a 'change' event.
fileInput.addEventListener('change', (evt) => {
  const files = evt.target.files;
  const picked = (files && files[0]) ? files[0] : null;
  if (picked) processFile(picked);
  evt.target.value = '';
});
738
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
739
  dropZone.addEventListener('dragover', (e) => {
740
  e.preventDefault();
741
  dropZone.classList.add('drag-over');
 
760
  }
761
  window.__activateModelTab = activateModelTab;
762
 
 
 
 
 
 
 
 
 
 
 
 
763
  document.querySelectorAll('.model-tab[data-model]').forEach(btn => {
764
  btn.addEventListener('click', () => activateModelTab(btn.getAttribute('data-model')));
765
  });
 
 
 
766
 
767
  activateModelTab('Cosmos-Reason1-7B');
 
768
 
769
  function syncSlider(customId, gradioId) {
770
  const slider = document.getElementById(customId);
 
795
  function validateBeforeRun() {
796
  const promptVal = promptInput.value.trim();
797
  if (!mediaState && !promptVal) {
798
+ showToast('Please upload an image and enter your instruction', 'error');
799
  flashPromptError();
800
  return false;
801
  }
802
  if (!mediaState) {
803
+ showToast('Please upload an image', 'error');
 
 
 
 
804
  return false;
805
  }
806
  if (!promptVal) {
 
819
  window.__clickGradioRunBtn = function() {
820
  if (!validateBeforeRun()) return;
821
  syncPromptToGradio();
822
+ syncImageToGradio();
823
  const activeModel = document.querySelector('.model-tab.active');
824
  if (activeModel) syncModelToGradio(activeModel.getAttribute('data-model'));
 
 
825
  if (outputArea) outputArea.value = '';
826
  showLoader();
827
  setTimeout(() => {
 
875
  });
876
  }
877
 
878
// Apply an example payload (JSON pushed into the hidden result textbox by
// the backend) to the UI: preview image, prompt text, and active model tab.
function applyExamplePayload(raw) {
  const clearLoading = () => {
    document.querySelectorAll('.example-card.loading').forEach((card) => {
      card.classList.remove('loading');
    });
  };
  try {
    const data = JSON.parse(raw);
    switch (data.status) {
      case 'ok':
        if (data.media) setPreview(data.media, data.name || 'example_file');
        if (data.query) {
          promptInput.value = data.query;
          syncPromptToGradio();
        }
        if (data.model) activateModelTab(data.model);
        clearLoading();
        showToast('Example loaded', 'info');
        break;
      case 'error':
        clearLoading();
        showToast(data.message || 'Failed to load example', 'error');
        break;
    }
  } catch (err) {
    // Covers bad JSON and any failure while applying the payload.
    clearLoading();
    showToast('Failed to parse example data', 'error');
  }
}
899
+
900
// Poll the hidden result textbox for a fresh example payload; give up after
// 80 ticks x 150 ms (~12 s) and clear the loading state.
function startExamplePolling() {
  if (examplePoller) clearInterval(examplePoller);
  let ticks = 0;
  const stop = () => {
    clearInterval(examplePoller);
    examplePoller = null;
  };
  examplePoller = setInterval(() => {
    ticks += 1;
    const payload = getValueFromContainer('example-result-data');
    if (payload && payload !== lastSeenExamplePayload) {
      lastSeenExamplePayload = payload;
      stop();
      applyExamplePayload(payload);
    } else if (ticks >= 80) {
      stop();
      document.querySelectorAll('.example-card.loading').forEach((card) => {
        card.classList.remove('loading');
      });
      showToast('Example load timed out', 'error');
    }
  }, 150);
}
921
+
922
// Click on an example card: mark it loading, push its index into the hidden
// Gradio inputs, trigger the hidden load button, then start polling.
document.querySelectorAll('.example-card[data-idx]').forEach((card) => {
  card.addEventListener('click', () => {
    const exampleIdx = card.getAttribute('data-idx');
    document.querySelectorAll('.example-card.loading').forEach((other) => {
      other.classList.remove('loading');
    });
    card.classList.add('loading');
    showToast('Loading example...', 'info');

    setGradioValue('example-result-data', '');
    setGradioValue('example-idx-input', exampleIdx);

    // Short delay lets the hidden input values propagate before the click.
    setTimeout(() => {
      const wrapper = document.getElementById('example-load-btn');
      if (wrapper) {
        const inner = wrapper.querySelector('button');
        (inner || wrapper).click();
      }
      startExamplePolling();
    }, 220);
  });
});
942
 
943
// Faster path than polling: react as soon as Gradio mutates the hidden
// result textbox, cancelling any poller that is still running.
const observerTarget = document.getElementById('example-result-data');
if (observerTarget) {
  const watcher = new MutationObserver(() => {
    const payload = getValueFromContainer('example-result-data');
    if (!payload || payload === lastSeenExamplePayload) return;
    lastSeenExamplePayload = payload;
    if (examplePoller) {
      clearInterval(examplePoller);
      examplePoller = null;
    }
    applyExamplePayload(payload);
  });
  watcher.observe(observerTarget, {
    childList: true,
    subtree: true,
    characterData: true,
    attributes: true,
  });
}
 
958
 
959
  if (outputArea) outputArea.value = '';
960
  const sb = document.getElementById('sb-run-state');
 
1017
  for m in MODEL_CHOICES
1018
  ])
1019
 
 
 
 
 
 
1020
  with gr.Blocks() as demo:
 
1021
  hidden_image_b64 = gr.Textbox(value="", elem_id="hidden-image-b64", elem_classes="hidden-input", container=False)
 
1022
  prompt = gr.Textbox(value="", elem_id="prompt-gradio-input", elem_classes="hidden-input", container=False)
1023
  hidden_model_name = gr.Textbox(value="Cosmos-Reason1-7B", elem_id="hidden-model-name", elem_classes="hidden-input", container=False)
1024
 
 
1050
  {MODEL_TABS_HTML}
1051
  </div>
1052
 
 
 
 
 
1053
  <div class="app-main-row">
1054
  <div class="app-main-left">
1055
  <div id="media-drop-zone">
 
1066
  <div id="single-preview-wrap" class="single-preview-wrap">
1067
  <div class="single-preview-card">
1068
  <img id="single-preview-img" src="" alt="Preview" style="display:none;">
 
1069
  <div class="preview-overlay-actions">
1070
  <button id="preview-upload-btn" class="preview-action-btn" title="Replace">Upload</button>
1071
  <button id="preview-clear-btn" class="preview-action-btn" title="Clear">Clear</button>
 
1075
  </div>
1076
 
1077
  <div class="hint-bar">
1078
+ <b>Upload:</b> Click or drag an image into the panel &nbsp;&middot;&nbsp;
 
1079
  <b>Model:</b> Change models from the header &nbsp;&middot;&nbsp;
1080
+ <kbd>Clear</kbd> removes the current image
1081
  </div>
1082
 
1083
  <div class="examples-section">
 
1093
  <div class="panel-card-title">Vision / OCR Instruction</div>
1094
  <div class="panel-card-body">
1095
  <label class="modern-label" for="custom-query-input">Query Input</label>
1096
+ <textarea id="custom-query-input" class="modern-textarea" rows="4" placeholder="e.g., perform OCR on this image, describe the document, identify visible text, analyze the scene..."></textarea>
1097
  </div>
1098
  </div>
1099
 
 
1180
  run_btn.click(
1181
  fn=run_inference,
1182
  inputs=[
 
1183
  hidden_model_name,
1184
  prompt,
1185
  hidden_image_b64,
 
1186
  max_new_tokens,
1187
  temperature,
1188
  top_p,
 
1191
  gpu_duration_state,
1192
  ],
1193
  outputs=[result],
1194
+ js=r"""(model, p, img, mnt, t, tp, tk, rp, gd) => {
1195
  const modelEl = document.querySelector('.model-tab.active');
 
1196
  const modelVal = modelEl ? modelEl.getAttribute('data-model') : model;
 
1197
  const promptEl = document.getElementById('custom-query-input');
1198
  const promptVal = promptEl ? promptEl.value : p;
1199
 
1200
  let imgVal = img;
 
 
1201
  const imgContainer = document.getElementById('hidden-image-b64');
 
 
1202
  if (imgContainer) {
1203
  const inner = imgContainer.querySelector('textarea, input');
1204
  if (inner) imgVal = inner.value;
1205
  }
 
 
 
 
1206
 
1207
+ return [modelVal, promptVal, imgVal, mnt, t, tp, tk, rp, gd];
1208
  }""",
1209
  )
1210
 
 
1221
  mcp_server=True,
1222
  ssr_mode=False,
1223
  show_error=True,
1224
+ allowed_paths=["images"],
1225
  )