prithivMLmods commited on
Commit
4c21b52
·
verified ·
1 Parent(s): 678e058

update app

Browse files
Files changed (1) hide show
  1. app.py +121 -339
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import os
2
  import gc
3
  import json
4
- import uuid
5
  import time
6
  import base64
7
  from io import BytesIO
@@ -10,12 +9,10 @@ from threading import Thread
10
  import gradio as gr
11
  import spaces
12
  import torch
13
- import numpy as np
14
  from PIL import Image
15
  import cv2
16
 
17
  from transformers import (
18
- Qwen2VLForConditionalGeneration,
19
  Qwen2_5_VLForConditionalGeneration,
20
  AutoProcessor,
21
  TextIteratorStreamer,
@@ -74,17 +71,10 @@ MODEL_MAP = {
74
  MODEL_CHOICES = list(MODEL_MAP.keys())
75
 
76
  image_examples = [
77
- {"query": "Perform OCR on the text in the image.", "media": "images/1.jpg", "model": "docscopeOCR-7B-050425-exp", "mode": "image"},
78
- {"query": "Explain the scene in detail.", "media": "images/2.jpg", "model": "Cosmos-Reason1-7B", "mode": "image"},
79
  ]
80
 
81
- video_examples = [
82
- {"query": "Explain the Ad in Detail", "media": "videos/1.mp4", "model": "Captioner-7B-Qwen2.5VL", "mode": "video"},
83
- {"query": "Identify the main actions in the video", "media": "videos/2.mp4", "model": "visionOCR-3B", "mode": "video"},
84
- ]
85
-
86
- all_examples = image_examples + video_examples
87
-
88
 
89
  def pil_to_data_url(img: Image.Image, fmt="PNG"):
90
  buf = BytesIO()
@@ -103,27 +93,15 @@ def file_to_data_url(path):
103
  "jpeg": "image/jpeg",
104
  "png": "image/png",
105
  "webp": "image/webp",
106
- "mp4": "video/mp4",
107
- "mov": "video/quicktime",
108
- "webm": "video/webm",
109
- }.get(ext, "application/octet-stream")
110
  with open(path, "rb") as f:
111
  data = base64.b64encode(f.read()).decode()
112
  return f"data:{mime};base64,{data}"
113
 
114
 
115
- def make_thumb_b64(path, mode="image", max_dim=240):
116
  try:
117
- if mode == "video":
118
- cap = cv2.VideoCapture(path)
119
- ok, frame = cap.read()
120
- cap.release()
121
- if not ok:
122
- return ""
123
- frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
124
- img = Image.fromarray(frame).convert("RGB")
125
- else:
126
- img = Image.open(path).convert("RGB")
127
  img.thumbnail((max_dim, max_dim))
128
  return pil_to_data_url(img, "JPEG")
129
  except Exception as e:
@@ -133,15 +111,14 @@ def make_thumb_b64(path, mode="image", max_dim=240):
133
 
134
  def build_example_cards_html():
135
  cards = ""
136
- for i, ex in enumerate(all_examples):
137
- thumb = make_thumb_b64(ex["media"], ex["mode"])
138
  prompt_short = ex["query"][:72] + ("..." if len(ex["query"]) > 72 else "")
139
- media_badge = "VIDEO" if ex["mode"] == "video" else "IMAGE"
140
  cards += f"""
141
  <div class="example-card" data-idx="{i}">
142
  <div class="example-thumb-wrap">
143
  {"<img src='" + thumb + "' alt=''>" if thumb else "<div class='example-thumb-placeholder'>Preview</div>"}
144
- <div class="example-media-chip">{media_badge}</div>
145
  </div>
146
  <div class="example-meta-row">
147
  <span class="example-badge">{ex["model"]}</span>
@@ -160,18 +137,17 @@ def load_example_data(idx_str):
160
  idx = int(float(idx_str))
161
  except Exception:
162
  return json.dumps({"status": "error", "message": "Invalid example index"})
163
- if idx < 0 or idx >= len(all_examples):
164
  return json.dumps({"status": "error", "message": "Example index out of range"})
165
- ex = all_examples[idx]
166
  media_b64 = file_to_data_url(ex["media"])
167
  if not media_b64:
168
- return json.dumps({"status": "error", "message": f"Could not load example {ex['mode']}"})
169
  return json.dumps({
170
  "status": "ok",
171
  "query": ex["query"],
172
  "media": media_b64,
173
  "model": ex["model"],
174
- "mode": ex["mode"],
175
  "name": os.path.basename(ex["media"]),
176
  })
177
 
@@ -190,54 +166,6 @@ def b64_to_pil(b64_str):
190
  return None
191
 
192
 
193
- def b64_to_temp_video(b64_str):
194
- if not b64_str:
195
- return None
196
- try:
197
- if b64_str.startswith("data:"):
198
- header, data = b64_str.split(",", 1)
199
- mime = header.split(";")[0].replace("data:", "")
200
- else:
201
- data = b64_str
202
- mime = "video/mp4"
203
- ext = {
204
- "video/mp4": ".mp4",
205
- "video/webm": ".webm",
206
- "video/quicktime": ".mov",
207
- }.get(mime, ".mp4")
208
- raw = base64.b64decode(data)
209
- temp_dir = os.path.join("/tmp", "docscope_r1_media")
210
- os.makedirs(temp_dir, exist_ok=True)
211
- path = os.path.join(temp_dir, f"{uuid.uuid4().hex}{ext}")
212
- with open(path, "wb") as f:
213
- f.write(raw)
214
- return path
215
- except Exception:
216
- return None
217
-
218
-
219
- def downsample_video(video_path):
220
- vidcap = cv2.VideoCapture(video_path)
221
- total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
222
- fps = vidcap.get(cv2.CAP_PROP_FPS) or 1.0
223
- frames = []
224
- frame_count = min(total_frames, 10) if total_frames > 0 else 0
225
- if frame_count == 0:
226
- vidcap.release()
227
- return frames
228
- frame_indices = np.linspace(0, total_frames - 1, frame_count, dtype=int)
229
- for i in frame_indices:
230
- vidcap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
231
- success, image = vidcap.read()
232
- if success:
233
- image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
234
- pil_image = Image.fromarray(image)
235
- timestamp = round(float(i) / float(fps), 2)
236
- frames.append((pil_image, timestamp))
237
- vidcap.release()
238
- return frames
239
-
240
-
241
  def calc_timeout_image(model_name, text, image, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_timeout):
242
  try:
243
  return int(gpu_timeout)
@@ -245,13 +173,6 @@ def calc_timeout_image(model_name, text, image, max_new_tokens, temperature, top
245
  return 60
246
 
247
 
248
- def calc_timeout_video(model_name, text, video_path, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_timeout):
249
- try:
250
- return int(gpu_timeout)
251
- except Exception:
252
- return 60
253
-
254
-
255
  @spaces.GPU(duration=calc_timeout_image)
256
  def generate_image(model_name, text, image, max_new_tokens=1024, temperature=0.6, top_p=0.9, top_k=50, repetition_penalty=1.2, gpu_timeout=60):
257
  if not model_name or model_name not in MODEL_MAP:
@@ -314,102 +235,19 @@ def generate_image(model_name, text, image, max_new_tokens=1024, temperature=0.6
314
  torch.cuda.empty_cache()
315
 
316
 
317
- @spaces.GPU(duration=calc_timeout_video)
318
- def generate_video(model_name, text, video_path, max_new_tokens=1024, temperature=0.6, top_p=0.9, top_k=50, repetition_penalty=1.2, gpu_timeout=90):
319
- if not model_name or model_name not in MODEL_MAP:
320
- raise gr.Error("Please select a valid model.")
321
- if not video_path:
322
- raise gr.Error("Please upload a video.")
323
- if not text or not str(text).strip():
324
- raise gr.Error("Please enter your instruction.")
325
- if len(str(text)) > MAX_INPUT_TOKEN_LENGTH * 8:
326
- raise gr.Error("Query is too long. Please shorten your input.")
327
-
328
- processor, model = MODEL_MAP[model_name]
329
- frames = downsample_video(video_path)
330
- if not frames:
331
- raise gr.Error("Could not read the uploaded video.")
332
-
333
- messages = [
334
- {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
335
- {"role": "user", "content": [{"type": "text", "text": text}]}
336
- ]
337
-
338
- for image, timestamp in frames:
339
- messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
340
- messages[1]["content"].append({"type": "image", "image": image})
341
-
342
- inputs = processor.apply_chat_template(
343
- messages,
344
- tokenize=True,
345
- add_generation_prompt=True,
346
- return_dict=True,
347
- return_tensors="pt",
348
- truncation=True,
349
- max_length=MAX_INPUT_TOKEN_LENGTH
350
- ).to(device)
351
-
352
- streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
353
- generation_kwargs = {
354
- **inputs,
355
- "streamer": streamer,
356
- "max_new_tokens": int(max_new_tokens),
357
- "do_sample": True,
358
- "temperature": float(temperature),
359
- "top_p": float(top_p),
360
- "top_k": int(top_k),
361
- "repetition_penalty": float(repetition_penalty),
362
- }
363
-
364
- thread = Thread(target=model.generate, kwargs=generation_kwargs)
365
- thread.start()
366
-
367
- buffer = ""
368
- for new_text in streamer:
369
- buffer += new_text.replace("<|im_end|>", "")
370
- time.sleep(0.01)
371
- yield buffer
372
-
373
- gc.collect()
374
- if torch.cuda.is_available():
375
- torch.cuda.empty_cache()
376
-
377
-
378
- def run_inference(mode, model_name, text, image_b64, video_b64, max_new_tokens_v, temperature_v, top_p_v, top_k_v, repetition_penalty_v, gpu_timeout_v):
379
- if mode == "video":
380
- temp_video_path = b64_to_temp_video(video_b64)
381
- if not temp_video_path:
382
- raise gr.Error("Could not decode uploaded video.")
383
- try:
384
- yield from generate_video(
385
- model_name=model_name,
386
- text=text,
387
- video_path=temp_video_path,
388
- max_new_tokens=max_new_tokens_v,
389
- temperature=temperature_v,
390
- top_p=top_p_v,
391
- top_k=top_k_v,
392
- repetition_penalty=repetition_penalty_v,
393
- gpu_timeout=gpu_timeout_v,
394
- )
395
- finally:
396
- try:
397
- os.remove(temp_video_path)
398
- except Exception:
399
- pass
400
- else:
401
- image = b64_to_pil(image_b64)
402
- yield from generate_image(
403
- model_name=model_name,
404
- text=text,
405
- image=image,
406
- max_new_tokens=max_new_tokens_v,
407
- temperature=temperature_v,
408
- top_p=top_p_v,
409
- top_k=top_k_v,
410
- repetition_penalty=repetition_penalty_v,
411
- gpu_timeout=gpu_timeout_v,
412
- )
413
 
414
 
415
  def noop():
@@ -475,19 +313,6 @@ footer{display:none!important}
475
  .model-tab.active{background:rgba(255,20,147,.22);border-color:#FF1493;color:#fff!important;box-shadow:0 0 0 2px rgba(255,20,147,.10)}
476
  .model-tab-label{font-size:12px;color:#ffffff!important;font-weight:600}
477
 
478
- .mode-tabs-bar{
479
- background:#18181b;border-bottom:1px solid #27272a;padding:10px 16px 12px;
480
- display:flex;gap:8px;align-items:center;flex-wrap:wrap;
481
- }
482
- .mode-tab{
483
- display:inline-flex;align-items:center;justify-content:center;gap:6px;
484
- min-width:110px;height:34px;background:transparent;border:1px solid #27272a;
485
- border-radius:999px;cursor:pointer;font-size:12px;font-weight:700;padding:0 14px;
486
- color:#ffffff!important;transition:all .15s ease;text-transform:uppercase;letter-spacing:.5px;
487
- }
488
- .mode-tab:hover{background:rgba(255,20,147,.12);border-color:rgba(255,20,147,.35)}
489
- .mode-tab.active{background:rgba(255,20,147,.22);border-color:#FF1493;color:#fff!important;box-shadow:0 0 0 2px rgba(255,20,147,.10)}
490
-
491
  .app-main-row{display:flex;gap:0;flex:1;overflow:hidden}
492
  .app-main-left{flex:1;display:flex;flex-direction:column;min-width:0;border-right:1px solid #27272a}
493
  .app-main-right{width:470px;display:flex;flex-direction:column;flex-shrink:0;background:#18181b}
@@ -523,7 +348,7 @@ footer{display:none!important}
523
  overflow:hidden;border:1px solid #27272a;background:#111114;
524
  display:flex;align-items:center;justify-content:center;position:relative;
525
  }
526
- .single-preview-card img,.single-preview-card video{
527
  width:100%;height:100%;max-width:100%;max-height:100%;
528
  object-fit:contain;display:block;background:#000;
529
  }
@@ -757,24 +582,23 @@ function init() {
757
  const fileInput = document.getElementById('custom-file-input');
758
  const previewWrap = document.getElementById('single-preview-wrap');
759
  const previewImg = document.getElementById('single-preview-img');
760
- const previewVideo = document.getElementById('single-preview-video');
761
  const btnUpload = document.getElementById('preview-upload-btn');
762
  const btnClear = document.getElementById('preview-clear-btn');
763
  const promptInput = document.getElementById('custom-query-input');
764
  const runBtnEl = document.getElementById('custom-run-btn');
765
  const outputArea = document.getElementById('custom-output-textarea');
766
  const mediaStatus = document.getElementById('sb-media-status');
767
- const exampleResultContainer = document.getElementById('example-result-data');
768
 
769
- if (!dropZone || !fileInput || !promptInput || !previewWrap || !previewImg || !previewVideo) {
770
  setTimeout(init, 250);
771
  return;
772
  }
773
 
774
  window.__docScopeInitDone = true;
775
  let mediaState = null;
776
- let currentMode = 'image';
777
  let toastTimer = null;
 
 
778
 
779
  function showToast(message, type) {
780
  let toast = document.getElementById('app-toast');
@@ -826,6 +650,13 @@ function init() {
826
  setTimeout(() => outputArea.classList.remove('error-flash'), 800);
827
  }
828
 
 
 
 
 
 
 
 
829
  function setGradioValue(containerId, value) {
830
  const container = document.getElementById(containerId);
831
  if (!container) return;
@@ -841,10 +672,9 @@ function init() {
841
  });
842
  }
843
 
844
- function syncMediaToGradio() {
845
- setGradioValue('hidden-image-b64', mediaState && mediaState.mode === 'image' ? mediaState.b64 : '');
846
- setGradioValue('hidden-video-b64', mediaState && mediaState.mode === 'video' ? mediaState.b64 : '');
847
- const txt = mediaState ? (`1 ${mediaState.mode} uploaded`) : `No ${currentMode} uploaded`;
848
  if (mediaStatus) mediaStatus.textContent = txt;
849
  }
850
 
@@ -856,43 +686,25 @@ function init() {
856
  setGradioValue('hidden-model-name', name);
857
  }
858
 
859
- function syncModeToGradio(mode) {
860
- setGradioValue('hidden-mode-name', mode);
861
- }
862
-
863
  function renderPreview() {
864
  if (!mediaState) {
865
  previewImg.src = '';
866
- previewVideo.src = '';
867
  previewImg.style.display = 'none';
868
- previewVideo.style.display = 'none';
869
  previewWrap.style.display = 'none';
870
  if (uploadPrompt) uploadPrompt.style.display = 'flex';
871
- syncMediaToGradio();
872
  return;
873
  }
874
 
875
- if (mediaState.mode === 'video') {
876
- previewImg.src = '';
877
- previewImg.style.display = 'none';
878
- previewVideo.src = mediaState.b64;
879
- previewVideo.style.display = 'block';
880
- previewWrap.style.display = 'flex';
881
- } else {
882
- previewVideo.pause();
883
- previewVideo.removeAttribute('src');
884
- previewVideo.load();
885
- previewVideo.style.display = 'none';
886
- previewImg.src = mediaState.b64;
887
- previewImg.style.display = 'block';
888
- previewWrap.style.display = 'flex';
889
- }
890
  if (uploadPrompt) uploadPrompt.style.display = 'none';
891
- syncMediaToGradio();
892
  }
893
 
894
- function setPreview(b64, name, mode) {
895
- mediaState = {b64, name: name || 'file', mode: mode || currentMode};
896
  renderPreview();
897
  }
898
  window.__setPreview = setPreview;
@@ -905,40 +717,25 @@ function init() {
905
 
906
  function processFile(file) {
907
  if (!file) return;
908
- if (currentMode === 'image' && !file.type.startsWith('image/')) {
909
- showToast('Only image files are supported in Image mode', 'error');
910
- return;
911
- }
912
- if (currentMode === 'video' && !file.type.startsWith('video/')) {
913
- showToast('Only video files are supported in Video mode', 'error');
914
  return;
915
  }
916
  const reader = new FileReader();
917
- reader.onload = (e) => setPreview(e.target.result, file.name, currentMode);
918
  reader.readAsDataURL(file);
919
  }
920
 
 
 
 
 
921
  fileInput.addEventListener('change', (e) => {
922
  const file = e.target.files && e.target.files[0] ? e.target.files[0] : null;
923
  if (file) processFile(file);
924
  e.target.value = '';
925
  });
926
 
927
- function updateAccept() {
928
- fileInput.accept = currentMode === 'video' ? 'video/*' : 'image/*';
929
- const main = document.getElementById('upload-main-text');
930
- const sub = document.getElementById('upload-sub-text');
931
- if (main) main.textContent = currentMode === 'video' ? 'Click or drag a video here' : 'Click or drag an image here';
932
- if (sub) sub.textContent = currentMode === 'video'
933
- ? 'Upload one short video clip for document-aware video understanding'
934
- : 'Upload one document, page, screenshot, receipt, or scene image for OCR and reasoning';
935
- if (!mediaState && mediaStatus) mediaStatus.textContent = `No ${currentMode} uploaded`;
936
- }
937
-
938
- if (uploadClick) uploadClick.addEventListener('click', () => fileInput.click());
939
- if (btnUpload) btnUpload.addEventListener('click', () => fileInput.click());
940
- if (btnClear) btnClear.addEventListener('click', clearPreview);
941
-
942
  dropZone.addEventListener('dragover', (e) => {
943
  e.preventDefault();
944
  dropZone.classList.add('drag-over');
@@ -963,26 +760,11 @@ function init() {
963
  }
964
  window.__activateModelTab = activateModelTab;
965
 
966
- function activateModeTab(mode) {
967
- currentMode = mode;
968
- document.querySelectorAll('.mode-tab[data-mode]').forEach(btn => {
969
- btn.classList.toggle('active', btn.getAttribute('data-mode') === mode);
970
- });
971
- syncModeToGradio(mode);
972
- updateAccept();
973
- if (mediaState && mediaState.mode !== mode) clearPreview();
974
- }
975
- window.__activateModeTab = activateModeTab;
976
-
977
  document.querySelectorAll('.model-tab[data-model]').forEach(btn => {
978
  btn.addEventListener('click', () => activateModelTab(btn.getAttribute('data-model')));
979
  });
980
- document.querySelectorAll('.mode-tab[data-mode]').forEach(btn => {
981
- btn.addEventListener('click', () => activateModeTab(btn.getAttribute('data-mode')));
982
- });
983
 
984
  activateModelTab('Cosmos-Reason1-7B');
985
- activateModeTab('image');
986
 
987
  function syncSlider(customId, gradioId) {
988
  const slider = document.getElementById(customId);
@@ -1013,16 +795,12 @@ function init() {
1013
  function validateBeforeRun() {
1014
  const promptVal = promptInput.value.trim();
1015
  if (!mediaState && !promptVal) {
1016
- showToast(`Please upload a ${currentMode} and enter your instruction`, 'error');
1017
  flashPromptError();
1018
  return false;
1019
  }
1020
  if (!mediaState) {
1021
- showToast(`Please upload a ${currentMode}`, 'error');
1022
- return false;
1023
- }
1024
- if (mediaState.mode !== currentMode) {
1025
- showToast(`Uploaded media does not match ${currentMode} mode`, 'error');
1026
  return false;
1027
  }
1028
  if (!promptVal) {
@@ -1041,11 +819,9 @@ function init() {
1041
  window.__clickGradioRunBtn = function() {
1042
  if (!validateBeforeRun()) return;
1043
  syncPromptToGradio();
1044
- syncMediaToGradio();
1045
  const activeModel = document.querySelector('.model-tab.active');
1046
  if (activeModel) syncModelToGradio(activeModel.getAttribute('data-model'));
1047
- const activeMode = document.querySelector('.mode-tab.active');
1048
- if (activeMode) syncModeToGradio(activeMode.getAttribute('data-mode'));
1049
  if (outputArea) outputArea.value = '';
1050
  showLoader();
1051
  setTimeout(() => {
@@ -1099,55 +875,86 @@ function init() {
1099
  });
1100
  }
1101
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1102
  document.querySelectorAll('.example-card[data-idx]').forEach(card => {
1103
  card.addEventListener('click', () => {
1104
  const idx = card.getAttribute('data-idx');
1105
  document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
1106
  card.classList.add('loading');
1107
  showToast('Loading example...', 'info');
 
1108
  setGradioValue('example-result-data', '');
1109
  setGradioValue('example-idx-input', idx);
 
1110
  setTimeout(() => {
1111
  const btn = document.getElementById('example-load-btn');
1112
  if (btn) {
1113
  const b = btn.querySelector('button');
1114
  if (b) b.click(); else btn.click();
1115
  }
1116
- }, 150);
1117
- setTimeout(() => card.classList.remove('loading'), 12000);
1118
  });
1119
  });
1120
 
1121
- function checkExampleResult() {
1122
- if (!exampleResultContainer) return;
1123
- const el = exampleResultContainer.querySelector('textarea') || exampleResultContainer.querySelector('input');
1124
- if (!el || !el.value) return;
1125
- if (window.__lastExampleVal === el.value) return;
1126
- try {
1127
- const data = JSON.parse(el.value);
1128
- if (data.status === 'ok') {
1129
- window.__lastExampleVal = el.value;
1130
- if (data.mode) activateModeTab(data.mode);
1131
- if (data.media) setPreview(data.media, data.name || 'example', data.mode || 'image');
1132
- if (data.query) {
1133
- promptInput.value = data.query;
1134
- syncPromptToGradio();
1135
  }
1136
- if (data.model) activateModelTab(data.model);
1137
- document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
1138
- showToast('Example loaded', 'info');
1139
- } else if (data.status === 'error') {
1140
- document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
1141
- showToast(data.message || 'Failed to load example', 'error');
1142
  }
1143
- } catch(e) {}
1144
- }
1145
-
1146
- const obsExample = new MutationObserver(checkExampleResult);
1147
- if (exampleResultContainer) {
1148
- obsExample.observe(exampleResultContainer, {childList:true, subtree:true, characterData:true, attributes:true});
1149
  }
1150
- setInterval(checkExampleResult, 500);
1151
 
1152
  if (outputArea) outputArea.value = '';
1153
  const sb = document.getElementById('sb-run-state');
@@ -1210,15 +1017,8 @@ MODEL_TABS_HTML = "".join([
1210
  for m in MODEL_CHOICES
1211
  ])
1212
 
1213
- MODE_TABS_HTML = """
1214
- <button class="mode-tab active" data-mode="image">Image Inference</button>
1215
- <button class="mode-tab" data-mode="video">Video Inference</button>
1216
- """
1217
-
1218
  with gr.Blocks() as demo:
1219
- hidden_mode_name = gr.Textbox(value="image", elem_id="hidden-mode-name", elem_classes="hidden-input", container=False)
1220
  hidden_image_b64 = gr.Textbox(value="", elem_id="hidden-image-b64", elem_classes="hidden-input", container=False)
1221
- hidden_video_b64 = gr.Textbox(value="", elem_id="hidden-video-b64", elem_classes="hidden-input", container=False)
1222
  prompt = gr.Textbox(value="", elem_id="prompt-gradio-input", elem_classes="hidden-input", container=False)
1223
  hidden_model_name = gr.Textbox(value="Cosmos-Reason1-7B", elem_id="hidden-model-name", elem_classes="hidden-input", container=False)
1224
 
@@ -1250,10 +1050,6 @@ with gr.Blocks() as demo:
1250
  {MODEL_TABS_HTML}
1251
  </div>
1252
 
1253
- <div class="mode-tabs-bar">
1254
- {MODE_TABS_HTML}
1255
- </div>
1256
-
1257
  <div class="app-main-row">
1258
  <div class="app-main-left">
1259
  <div id="media-drop-zone">
@@ -1270,7 +1066,6 @@ with gr.Blocks() as demo:
1270
  <div id="single-preview-wrap" class="single-preview-wrap">
1271
  <div class="single-preview-card">
1272
  <img id="single-preview-img" src="" alt="Preview" style="display:none;">
1273
- <video id="single-preview-video" controls playsinline style="display:none;"></video>
1274
  <div class="preview-overlay-actions">
1275
  <button id="preview-upload-btn" class="preview-action-btn" title="Replace">Upload</button>
1276
  <button id="preview-clear-btn" class="preview-action-btn" title="Clear">Clear</button>
@@ -1280,10 +1075,9 @@ with gr.Blocks() as demo:
1280
  </div>
1281
 
1282
  <div class="hint-bar">
1283
- <b>Upload:</b> Click or drag media into the panel &nbsp;&middot;&nbsp;
1284
- <b>Mode:</b> Switch between image and video inference &nbsp;&middot;&nbsp;
1285
  <b>Model:</b> Change models from the header &nbsp;&middot;&nbsp;
1286
- <kbd>Clear</kbd> removes the current media
1287
  </div>
1288
 
1289
  <div class="examples-section">
@@ -1299,7 +1093,7 @@ with gr.Blocks() as demo:
1299
  <div class="panel-card-title">Vision / OCR Instruction</div>
1300
  <div class="panel-card-body">
1301
  <label class="modern-label" for="custom-query-input">Query Input</label>
1302
- <textarea id="custom-query-input" class="modern-textarea" rows="4" placeholder="e.g., perform OCR on this image, describe the document, explain the ad, summarize the video, identify visible text, analyze the scene..."></textarea>
1303
  </div>
1304
  </div>
1305
 
@@ -1386,11 +1180,9 @@ with gr.Blocks() as demo:
1386
  run_btn.click(
1387
  fn=run_inference,
1388
  inputs=[
1389
- hidden_mode_name,
1390
  hidden_model_name,
1391
  prompt,
1392
  hidden_image_b64,
1393
- hidden_video_b64,
1394
  max_new_tokens,
1395
  temperature,
1396
  top_p,
@@ -1399,30 +1191,20 @@ with gr.Blocks() as demo:
1399
  gpu_duration_state,
1400
  ],
1401
  outputs=[result],
1402
- js=r"""(mode, model, p, img, vid, mnt, t, tp, tk, rp, gd) => {
1403
  const modelEl = document.querySelector('.model-tab.active');
1404
- const modeEl = document.querySelector('.mode-tab.active');
1405
  const modelVal = modelEl ? modelEl.getAttribute('data-model') : model;
1406
- const modeVal = modeEl ? modeEl.getAttribute('data-mode') : mode;
1407
  const promptEl = document.getElementById('custom-query-input');
1408
  const promptVal = promptEl ? promptEl.value : p;
1409
 
1410
  let imgVal = img;
1411
- let vidVal = vid;
1412
-
1413
  const imgContainer = document.getElementById('hidden-image-b64');
1414
- const vidContainer = document.getElementById('hidden-video-b64');
1415
-
1416
  if (imgContainer) {
1417
  const inner = imgContainer.querySelector('textarea, input');
1418
  if (inner) imgVal = inner.value;
1419
  }
1420
- if (vidContainer) {
1421
- const inner = vidContainer.querySelector('textarea, input');
1422
- if (inner) vidVal = inner.value;
1423
- }
1424
 
1425
- return [modeVal, modelVal, promptVal, imgVal, vidVal, mnt, t, tp, tk, rp, gd];
1426
  }""",
1427
  )
1428
 
@@ -1439,5 +1221,5 @@ if __name__ == "__main__":
1439
  mcp_server=True,
1440
  ssr_mode=False,
1441
  show_error=True,
1442
- allowed_paths=["images", "videos"],
1443
  )
 
1
  import os
2
  import gc
3
  import json
 
4
  import time
5
  import base64
6
  from io import BytesIO
 
9
  import gradio as gr
10
  import spaces
11
  import torch
 
12
  from PIL import Image
13
  import cv2
14
 
15
  from transformers import (
 
16
  Qwen2_5_VLForConditionalGeneration,
17
  AutoProcessor,
18
  TextIteratorStreamer,
 
71
  MODEL_CHOICES = list(MODEL_MAP.keys())
72
 
73
  image_examples = [
74
+ {"query": "Perform OCR on the text in the image.", "media": "images/1.jpg", "model": "docscopeOCR-7B-050425-exp"},
75
+ {"query": "Explain the scene in detail.", "media": "images/2.jpg", "model": "Cosmos-Reason1-7B"},
76
  ]
77
 
 
 
 
 
 
 
 
78
 
79
  def pil_to_data_url(img: Image.Image, fmt="PNG"):
80
  buf = BytesIO()
 
93
  "jpeg": "image/jpeg",
94
  "png": "image/png",
95
  "webp": "image/webp",
96
+ }.get(ext, "image/jpeg")
 
 
 
97
  with open(path, "rb") as f:
98
  data = base64.b64encode(f.read()).decode()
99
  return f"data:{mime};base64,{data}"
100
 
101
 
102
+ def make_thumb_b64(path, max_dim=240):
103
  try:
104
+ img = Image.open(path).convert("RGB")
 
 
 
 
 
 
 
 
 
105
  img.thumbnail((max_dim, max_dim))
106
  return pil_to_data_url(img, "JPEG")
107
  except Exception as e:
 
111
 
112
  def build_example_cards_html():
113
  cards = ""
114
+ for i, ex in enumerate(image_examples):
115
+ thumb = make_thumb_b64(ex["media"])
116
  prompt_short = ex["query"][:72] + ("..." if len(ex["query"]) > 72 else "")
 
117
  cards += f"""
118
  <div class="example-card" data-idx="{i}">
119
  <div class="example-thumb-wrap">
120
  {"<img src='" + thumb + "' alt=''>" if thumb else "<div class='example-thumb-placeholder'>Preview</div>"}
121
+ <div class="example-media-chip">IMAGE</div>
122
  </div>
123
  <div class="example-meta-row">
124
  <span class="example-badge">{ex["model"]}</span>
 
137
  idx = int(float(idx_str))
138
  except Exception:
139
  return json.dumps({"status": "error", "message": "Invalid example index"})
140
+ if idx < 0 or idx >= len(image_examples):
141
  return json.dumps({"status": "error", "message": "Example index out of range"})
142
+ ex = image_examples[idx]
143
  media_b64 = file_to_data_url(ex["media"])
144
  if not media_b64:
145
+ return json.dumps({"status": "error", "message": "Could not load example image"})
146
  return json.dumps({
147
  "status": "ok",
148
  "query": ex["query"],
149
  "media": media_b64,
150
  "model": ex["model"],
 
151
  "name": os.path.basename(ex["media"]),
152
  })
153
 
 
166
  return None
167
 
168
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  def calc_timeout_image(model_name, text, image, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_timeout):
170
  try:
171
  return int(gpu_timeout)
 
173
  return 60
174
 
175
 
 
 
 
 
 
 
 
176
  @spaces.GPU(duration=calc_timeout_image)
177
  def generate_image(model_name, text, image, max_new_tokens=1024, temperature=0.6, top_p=0.9, top_k=50, repetition_penalty=1.2, gpu_timeout=60):
178
  if not model_name or model_name not in MODEL_MAP:
 
235
  torch.cuda.empty_cache()
236
 
237
 
238
+ def run_inference(model_name, text, image_b64, max_new_tokens_v, temperature_v, top_p_v, top_k_v, repetition_penalty_v, gpu_timeout_v):
239
+ image = b64_to_pil(image_b64)
240
+ yield from generate_image(
241
+ model_name=model_name,
242
+ text=text,
243
+ image=image,
244
+ max_new_tokens=max_new_tokens_v,
245
+ temperature=temperature_v,
246
+ top_p=top_p_v,
247
+ top_k=top_k_v,
248
+ repetition_penalty=repetition_penalty_v,
249
+ gpu_timeout=gpu_timeout_v,
250
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
 
252
 
253
  def noop():
 
313
  .model-tab.active{background:rgba(255,20,147,.22);border-color:#FF1493;color:#fff!important;box-shadow:0 0 0 2px rgba(255,20,147,.10)}
314
  .model-tab-label{font-size:12px;color:#ffffff!important;font-weight:600}
315
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
  .app-main-row{display:flex;gap:0;flex:1;overflow:hidden}
317
  .app-main-left{flex:1;display:flex;flex-direction:column;min-width:0;border-right:1px solid #27272a}
318
  .app-main-right{width:470px;display:flex;flex-direction:column;flex-shrink:0;background:#18181b}
 
348
  overflow:hidden;border:1px solid #27272a;background:#111114;
349
  display:flex;align-items:center;justify-content:center;position:relative;
350
  }
351
+ .single-preview-card img{
352
  width:100%;height:100%;max-width:100%;max-height:100%;
353
  object-fit:contain;display:block;background:#000;
354
  }
 
582
  const fileInput = document.getElementById('custom-file-input');
583
  const previewWrap = document.getElementById('single-preview-wrap');
584
  const previewImg = document.getElementById('single-preview-img');
 
585
  const btnUpload = document.getElementById('preview-upload-btn');
586
  const btnClear = document.getElementById('preview-clear-btn');
587
  const promptInput = document.getElementById('custom-query-input');
588
  const runBtnEl = document.getElementById('custom-run-btn');
589
  const outputArea = document.getElementById('custom-output-textarea');
590
  const mediaStatus = document.getElementById('sb-media-status');
 
591
 
592
+ if (!dropZone || !fileInput || !promptInput || !previewWrap || !previewImg) {
593
  setTimeout(init, 250);
594
  return;
595
  }
596
 
597
  window.__docScopeInitDone = true;
598
  let mediaState = null;
 
599
  let toastTimer = null;
600
+ let examplePoller = null;
601
+ let lastSeenExamplePayload = null;
602
 
603
  function showToast(message, type) {
604
  let toast = document.getElementById('app-toast');
 
650
  setTimeout(() => outputArea.classList.remove('error-flash'), 800);
651
  }
652
 
653
+ function getValueFromContainer(containerId) {
654
+ const container = document.getElementById(containerId);
655
+ if (!container) return '';
656
+ const el = container.querySelector('textarea, input');
657
+ return el ? (el.value || '') : '';
658
+ }
659
+
660
  function setGradioValue(containerId, value) {
661
  const container = document.getElementById(containerId);
662
  if (!container) return;
 
672
  });
673
  }
674
 
675
// Mirror the in-page image state into the hidden Gradio textbox and the
// sidebar status label.
function syncImageToGradio() {
  if (mediaState) {
    setGradioValue('hidden-image-b64', mediaState.b64);
    if (mediaStatus) mediaStatus.textContent = '1 image uploaded';
  } else {
    setGradioValue('hidden-image-b64', '');
    if (mediaStatus) mediaStatus.textContent = 'No image uploaded';
  }
}
680
 
 
686
  setGradioValue('hidden-model-name', name);
687
  }
688
 
 
 
 
 
689
// Render the preview pane from mediaState: show the image when one is set,
// otherwise show the empty-state upload prompt. Always re-syncs Gradio.
function renderPreview() {
  const hasImage = Boolean(mediaState);
  previewImg.src = hasImage ? mediaState.b64 : '';
  previewImg.style.display = hasImage ? 'block' : 'none';
  previewWrap.style.display = hasImage ? 'flex' : 'none';
  if (uploadPrompt) uploadPrompt.style.display = hasImage ? 'none' : 'flex';
  syncImageToGradio();
}
705
 
706
// Store a newly chosen image (base64 data URL + display name) and repaint.
function setPreview(b64, name) {
  mediaState = {b64: b64, name: name ? name : 'file'};
  renderPreview();
}
// Exposed globally so example-loading code can inject images.
window.__setPreview = setPreview;
 
717
 
718
// Accept a dropped or selected File: images only, read into a data URL and
// hand off to setPreview.
function processFile(file) {
  if (!file) return;
  if (file.type.indexOf('image/') !== 0) {
    showToast('Only image files are supported', 'error');
    return;
  }
  const reader = new FileReader();
  reader.onload = function (evt) {
    setPreview(evt.target.result, file.name);
  };
  reader.readAsDataURL(file);
}
728
 
729
// Wire the upload entry points: the prompt area and the overlay Upload
// button both open the native file picker; Clear resets the preview.
[uploadClick, btnUpload].forEach((el) => {
  if (el) el.addEventListener('click', () => fileInput.click());
});
if (btnClear) btnClear.addEventListener('click', clearPreview);

// Forward the picked file, then reset the input so choosing the same file
// again still fires a 'change' event.
fileInput.addEventListener('change', (evt) => {
  const files = evt.target.files;
  const picked = (files && files[0]) ? files[0] : null;
  if (picked) processFile(picked);
  evt.target.value = '';
});
738
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
739
  dropZone.addEventListener('dragover', (e) => {
740
  e.preventDefault();
741
  dropZone.classList.add('drag-over');
 
760
  }
761
  window.__activateModelTab = activateModelTab;
762
 
 
 
 
 
 
 
 
 
 
 
 
763
  document.querySelectorAll('.model-tab[data-model]').forEach(btn => {
764
  btn.addEventListener('click', () => activateModelTab(btn.getAttribute('data-model')));
765
  });
 
 
 
766
 
767
  activateModelTab('Cosmos-Reason1-7B');
 
768
 
769
  function syncSlider(customId, gradioId) {
770
  const slider = document.getElementById(customId);
 
795
  function validateBeforeRun() {
796
  const promptVal = promptInput.value.trim();
797
  if (!mediaState && !promptVal) {
798
+ showToast('Please upload an image and enter your instruction', 'error');
799
  flashPromptError();
800
  return false;
801
  }
802
  if (!mediaState) {
803
+ showToast('Please upload an image', 'error');
 
 
 
 
804
  return false;
805
  }
806
  if (!promptVal) {
 
819
  window.__clickGradioRunBtn = function() {
820
  if (!validateBeforeRun()) return;
821
  syncPromptToGradio();
822
+ syncImageToGradio();
823
  const activeModel = document.querySelector('.model-tab.active');
824
  if (activeModel) syncModelToGradio(activeModel.getAttribute('data-model'));
 
 
825
  if (outputArea) outputArea.value = '';
826
  showLoader();
827
  setTimeout(() => {
 
875
  });
876
  }
877
 
878
// Apply an example payload (JSON pushed into the hidden result textbox by
// the backend) to the UI: preview image, prompt text, and active model tab.
function applyExamplePayload(raw) {
  const clearLoading = () => {
    document.querySelectorAll('.example-card.loading').forEach((card) => {
      card.classList.remove('loading');
    });
  };
  try {
    const data = JSON.parse(raw);
    switch (data.status) {
      case 'ok':
        if (data.media) setPreview(data.media, data.name || 'example_file');
        if (data.query) {
          promptInput.value = data.query;
          syncPromptToGradio();
        }
        if (data.model) activateModelTab(data.model);
        clearLoading();
        showToast('Example loaded', 'info');
        break;
      case 'error':
        clearLoading();
        showToast(data.message || 'Failed to load example', 'error');
        break;
    }
  } catch (err) {
    // Covers bad JSON and any failure while applying the payload.
    clearLoading();
    showToast('Failed to parse example data', 'error');
  }
}
899
+
900
// Poll the hidden result textbox for a fresh example payload; give up after
// 80 ticks x 150 ms (~12 s) and clear the loading state.
function startExamplePolling() {
  if (examplePoller) clearInterval(examplePoller);
  let ticks = 0;
  const stop = () => {
    clearInterval(examplePoller);
    examplePoller = null;
  };
  examplePoller = setInterval(() => {
    ticks += 1;
    const payload = getValueFromContainer('example-result-data');
    if (payload && payload !== lastSeenExamplePayload) {
      lastSeenExamplePayload = payload;
      stop();
      applyExamplePayload(payload);
    } else if (ticks >= 80) {
      stop();
      document.querySelectorAll('.example-card.loading').forEach((card) => {
        card.classList.remove('loading');
      });
      showToast('Example load timed out', 'error');
    }
  }, 150);
}
921
+
922
// Click on an example card: mark it loading, push its index into the hidden
// Gradio inputs, trigger the hidden load button, then start polling.
document.querySelectorAll('.example-card[data-idx]').forEach((card) => {
  card.addEventListener('click', () => {
    const exampleIdx = card.getAttribute('data-idx');
    document.querySelectorAll('.example-card.loading').forEach((other) => {
      other.classList.remove('loading');
    });
    card.classList.add('loading');
    showToast('Loading example...', 'info');

    setGradioValue('example-result-data', '');
    setGradioValue('example-idx-input', exampleIdx);

    // Short delay lets the hidden input values propagate before the click.
    setTimeout(() => {
      const wrapper = document.getElementById('example-load-btn');
      if (wrapper) {
        const inner = wrapper.querySelector('button');
        (inner || wrapper).click();
      }
      startExamplePolling();
    }, 220);
  });
});
942
 
943
// Faster path than polling: react as soon as Gradio mutates the hidden
// result textbox, cancelling any poller that is still running.
const observerTarget = document.getElementById('example-result-data');
if (observerTarget) {
  const watcher = new MutationObserver(() => {
    const payload = getValueFromContainer('example-result-data');
    if (!payload || payload === lastSeenExamplePayload) return;
    lastSeenExamplePayload = payload;
    if (examplePoller) {
      clearInterval(examplePoller);
      examplePoller = null;
    }
    applyExamplePayload(payload);
  });
  watcher.observe(observerTarget, {
    childList: true,
    subtree: true,
    characterData: true,
    attributes: true,
  });
}
 
958
 
959
  if (outputArea) outputArea.value = '';
960
  const sb = document.getElementById('sb-run-state');
 
1017
  for m in MODEL_CHOICES
1018
  ])
1019
 
 
 
 
 
 
1020
  with gr.Blocks() as demo:
 
1021
  hidden_image_b64 = gr.Textbox(value="", elem_id="hidden-image-b64", elem_classes="hidden-input", container=False)
 
1022
  prompt = gr.Textbox(value="", elem_id="prompt-gradio-input", elem_classes="hidden-input", container=False)
1023
  hidden_model_name = gr.Textbox(value="Cosmos-Reason1-7B", elem_id="hidden-model-name", elem_classes="hidden-input", container=False)
1024
 
 
1050
  {MODEL_TABS_HTML}
1051
  </div>
1052
 
 
 
 
 
1053
  <div class="app-main-row">
1054
  <div class="app-main-left">
1055
  <div id="media-drop-zone">
 
1066
  <div id="single-preview-wrap" class="single-preview-wrap">
1067
  <div class="single-preview-card">
1068
  <img id="single-preview-img" src="" alt="Preview" style="display:none;">
 
1069
  <div class="preview-overlay-actions">
1070
  <button id="preview-upload-btn" class="preview-action-btn" title="Replace">Upload</button>
1071
  <button id="preview-clear-btn" class="preview-action-btn" title="Clear">Clear</button>
 
1075
  </div>
1076
 
1077
  <div class="hint-bar">
1078
+ <b>Upload:</b> Click or drag an image into the panel &nbsp;&middot;&nbsp;
 
1079
  <b>Model:</b> Change models from the header &nbsp;&middot;&nbsp;
1080
+ <kbd>Clear</kbd> removes the current image
1081
  </div>
1082
 
1083
  <div class="examples-section">
 
1093
  <div class="panel-card-title">Vision / OCR Instruction</div>
1094
  <div class="panel-card-body">
1095
  <label class="modern-label" for="custom-query-input">Query Input</label>
1096
+ <textarea id="custom-query-input" class="modern-textarea" rows="4" placeholder="e.g., perform OCR on this image, describe the document, identify visible text, analyze the scene..."></textarea>
1097
  </div>
1098
  </div>
1099
 
 
1180
  run_btn.click(
1181
  fn=run_inference,
1182
  inputs=[
 
1183
  hidden_model_name,
1184
  prompt,
1185
  hidden_image_b64,
 
1186
  max_new_tokens,
1187
  temperature,
1188
  top_p,
 
1191
  gpu_duration_state,
1192
  ],
1193
  outputs=[result],
1194
+ js=r"""(model, p, img, mnt, t, tp, tk, rp, gd) => {
1195
  const modelEl = document.querySelector('.model-tab.active');
 
1196
  const modelVal = modelEl ? modelEl.getAttribute('data-model') : model;
 
1197
  const promptEl = document.getElementById('custom-query-input');
1198
  const promptVal = promptEl ? promptEl.value : p;
1199
 
1200
  let imgVal = img;
 
 
1201
  const imgContainer = document.getElementById('hidden-image-b64');
 
 
1202
  if (imgContainer) {
1203
  const inner = imgContainer.querySelector('textarea, input');
1204
  if (inner) imgVal = inner.value;
1205
  }
 
 
 
 
1206
 
1207
+ return [modelVal, promptVal, imgVal, mnt, t, tp, tk, rp, gd];
1208
  }""",
1209
  )
1210
 
 
1221
  mcp_server=True,
1222
  ssr_mode=False,
1223
  show_error=True,
1224
+ allowed_paths=["images"],
1225
  )