prithivMLmods commited on
Commit
678e058
·
verified ·
1 Parent(s): bba5db0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1298 -340
app.py CHANGED
@@ -1,11 +1,11 @@
1
  import os
2
- import random
3
- import uuid
4
  import json
 
5
  import time
6
- import asyncio
 
7
  from threading import Thread
8
- from typing import Iterable
9
 
10
  import gradio as gr
11
  import spaces
@@ -13,204 +13,20 @@ import torch
13
  import numpy as np
14
  from PIL import Image
15
  import cv2
16
- import requests
17
 
18
  from transformers import (
19
  Qwen2VLForConditionalGeneration,
20
  Qwen2_5_VLForConditionalGeneration,
21
  AutoProcessor,
22
  TextIteratorStreamer,
23
- AutoModel,
24
- AutoTokenizer,
25
- )
26
- from transformers.image_utils import load_image
27
- from gradio.themes import Soft
28
- from gradio.themes.utils import colors, fonts, sizes
29
-
30
- colors.steel_blue = colors.Color(
31
- name="steel_blue",
32
- c50="#EBF3F8",
33
- c100="#D3E5F0",
34
- c200="#A8CCE1",
35
- c300="#7DB3D2",
36
- c400="#529AC3",
37
- c500="#4682B4",
38
- c600="#3E72A0",
39
- c700="#36638C",
40
- c800="#2E5378",
41
- c900="#264364",
42
- c950="#1E3450",
43
  )
44
 
45
- class SteelBlueTheme(Soft):
46
- def __init__(
47
- self,
48
- *,
49
- primary_hue: colors.Color | str = colors.gray,
50
- secondary_hue: colors.Color | str = colors.steel_blue,
51
- neutral_hue: colors.Color | str = colors.slate,
52
- text_size: sizes.Size | str = sizes.text_lg,
53
- font: fonts.Font | str | Iterable[fonts.Font | str] = (
54
- fonts.GoogleFont("Outfit"), "Arial", "sans-serif",
55
- ),
56
- font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
57
- fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace",
58
- ),
59
- ):
60
- super().__init__(
61
- primary_hue=primary_hue,
62
- secondary_hue=secondary_hue,
63
- neutral_hue=neutral_hue,
64
- text_size=text_size,
65
- font=font,
66
- font_mono=font_mono,
67
- )
68
- super().set(
69
- background_fill_primary="*primary_50",
70
- background_fill_primary_dark="*primary_900",
71
- body_background_fill="linear-gradient(135deg, *primary_200, *primary_100)",
72
- body_background_fill_dark="linear-gradient(135deg, *primary_900, *primary_800)",
73
- button_primary_text_color="white",
74
- button_primary_text_color_hover="white",
75
- button_primary_background_fill="linear-gradient(90deg, *secondary_500, *secondary_600)",
76
- button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_700)",
77
- button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_800)",
78
- button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_500)",
79
- slider_color="*secondary_500",
80
- slider_color_dark="*secondary_600",
81
- block_title_text_weight="600",
82
- block_border_width="3px",
83
- block_shadow="*shadow_drop_lg",
84
- button_primary_shadow="*shadow_drop_lg",
85
- button_large_padding="11px",
86
- color_accent_soft="*primary_100",
87
- block_label_background_fill="*primary_200",
88
- )
89
-
90
- steel_blue_theme = SteelBlueTheme()
91
-
92
- css = """
93
- #main-title h1 {
94
- font-size: 2.3em !important;
95
- }
96
- #output-title h2 {
97
- font-size: 2.2em !important;
98
- }
99
-
100
- /* RadioAnimated Styles */
101
- .ra-wrap{ width: fit-content; }
102
- .ra-inner{
103
- position: relative; display: inline-flex; align-items: center; gap: 0; padding: 6px;
104
- background: var(--neutral-200); border-radius: 9999px; overflow: hidden;
105
- }
106
- .ra-input{ display: none; }
107
- .ra-label{
108
- position: relative; z-index: 2; padding: 8px 16px;
109
- font-family: inherit; font-size: 14px; font-weight: 600;
110
- color: var(--neutral-500); cursor: pointer; transition: color 0.2s; white-space: nowrap;
111
- }
112
- .ra-highlight{
113
- position: absolute; z-index: 1; top: 6px; left: 6px;
114
- height: calc(100% - 12px); border-radius: 9999px;
115
- background: white; box-shadow: 0 2px 4px rgba(0,0,0,0.1);
116
- transition: transform 0.2s, width 0.2s;
117
- }
118
- .ra-input:checked + .ra-label{ color: black; }
119
-
120
- /* Dark mode adjustments for Radio */
121
- .dark .ra-inner { background: var(--neutral-800); }
122
- .dark .ra-label { color: var(--neutral-400); }
123
- .dark .ra-highlight { background: var(--neutral-600); }
124
- .dark .ra-input:checked + .ra-label { color: white; }
125
-
126
- #gpu-duration-container {
127
- padding: 10px;
128
- border-radius: 8px;
129
- background: var(--background-fill-secondary);
130
- border: 1px solid var(--border-color-primary);
131
- margin-top: 10px;
132
- }
133
- """
134
-
135
  MAX_MAX_NEW_TOKENS = 2048
136
  DEFAULT_MAX_NEW_TOKENS = 1024
137
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
138
 
139
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
140
-
141
- class RadioAnimated(gr.HTML):
142
- def __init__(self, choices, value=None, **kwargs):
143
- if not choices or len(choices) < 2:
144
- raise ValueError("RadioAnimated requires at least 2 choices.")
145
- if value is None:
146
- value = choices[0]
147
-
148
- uid = uuid.uuid4().hex[:8]
149
- group_name = f"ra-{uid}"
150
-
151
- inputs_html = "\n".join(
152
- f"""
153
- <input class="ra-input" type="radio" name="{group_name}" id="{group_name}-{i}" value="{c}">
154
- <label class="ra-label" for="{group_name}-{i}">{c}</label>
155
- """
156
- for i, c in enumerate(choices)
157
- )
158
-
159
- html_template = f"""
160
- <div class="ra-wrap" data-ra="{uid}">
161
- <div class="ra-inner">
162
- <div class="ra-highlight"></div>
163
- {inputs_html}
164
- </div>
165
- </div>
166
- """
167
-
168
- js_on_load = r"""
169
- (() => {
170
- const wrap = element.querySelector('.ra-wrap');
171
- const inner = element.querySelector('.ra-inner');
172
- const highlight = element.querySelector('.ra-highlight');
173
- const inputs = Array.from(element.querySelectorAll('.ra-input'));
174
-
175
- if (!inputs.length) return;
176
-
177
- const choices = inputs.map(i => i.value);
178
-
179
- function setHighlightByIndex(idx) {
180
- const n = choices.length;
181
- const pct = 100 / n;
182
- highlight.style.width = `calc(${pct}% - 6px)`;
183
- highlight.style.transform = `translateX(${idx * 100}%)`;
184
- }
185
-
186
- function setCheckedByValue(val, shouldTrigger=false) {
187
- const idx = Math.max(0, choices.indexOf(val));
188
- inputs.forEach((inp, i) => { inp.checked = (i === idx); });
189
- setHighlightByIndex(idx);
190
-
191
- props.value = choices[idx];
192
- if (shouldTrigger) trigger('change', props.value);
193
- }
194
-
195
- setCheckedByValue(props.value ?? choices[0], false);
196
-
197
- inputs.forEach((inp) => {
198
- inp.addEventListener('change', () => {
199
- setCheckedByValue(inp.value, true);
200
- });
201
- });
202
- })();
203
- """
204
-
205
- super().__init__(
206
- value=value,
207
- html_template=html_template,
208
- js_on_load=js_on_load,
209
- **kwargs
210
- )
211
-
212
- def apply_gpu_duration(val: str):
213
- return int(val)
214
 
215
  MODEL_ID_M = "nvidia/Cosmos-Reason1-7B"
216
  processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
@@ -248,72 +64,206 @@ model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
248
  torch_dtype=torch.float16
249
  ).to(device).eval()
250
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
  def downsample_video(video_path):
252
- """
253
- Downsamples the video to evenly spaced frames.
254
- Each frame is returned as a PIL image along with its timestamp.
255
- """
256
  vidcap = cv2.VideoCapture(video_path)
257
  total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
258
- fps = vidcap.get(cv2.CAP_PROP_FPS)
259
  frames = []
260
- frame_indices = np.linspace(0, total_frames - 1, min(total_frames, 10), dtype=int)
 
 
 
 
261
  for i in frame_indices:
262
- vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
263
  success, image = vidcap.read()
264
  if success:
265
  image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
266
  pil_image = Image.fromarray(image)
267
- timestamp = round(i / fps, 2)
268
  frames.append((pil_image, timestamp))
269
  vidcap.release()
270
  return frames
271
 
272
- def calc_timeout_image(model_name: str, text: str, image: Image.Image,
273
- max_new_tokens: int, temperature: float, top_p: float,
274
- top_k: int, repetition_penalty: float, gpu_timeout: int):
275
- """Calculate GPU timeout duration for image inference."""
276
  try:
277
  return int(gpu_timeout)
278
- except:
279
  return 60
280
 
281
- def calc_timeout_video(model_name: str, text: str, video_path: str,
282
- max_new_tokens: int, temperature: float, top_p: float,
283
- top_k: int, repetition_penalty: float, gpu_timeout: int):
284
- """Calculate GPU timeout duration for video inference."""
285
  try:
286
  return int(gpu_timeout)
287
- except:
288
  return 60
289
 
290
- @spaces.GPU(duration=calc_timeout_image)
291
- def generate_image(model_name: str, text: str, image: Image.Image,
292
- max_new_tokens: int = 1024,
293
- temperature: float = 0.6,
294
- top_p: float = 0.9,
295
- top_k: int = 50,
296
- repetition_penalty: float = 1.2,
297
- gpu_timeout: int = 60):
298
- """
299
- Generates responses using the selected model for image input.
300
- Yields raw text and Markdown-formatted text.
301
- """
302
- if model_name == "Cosmos-Reason1-7B":
303
- processor, model = processor_m, model_m
304
- elif model_name == "docscopeOCR-7B-050425-exp":
305
- processor, model = processor_x, model_x
306
- elif model_name == "Captioner-7B-Qwen2.5VL":
307
- processor, model = processor_z, model_z
308
- elif model_name == "visionOCR-3B":
309
- processor, model = processor_v, model_v
310
- else:
311
- yield "Invalid model selected.", "Invalid model selected."
312
- return
313
 
 
 
 
 
314
  if image is None:
315
- yield "Please upload an image.", "Please upload an image."
316
- return
 
 
 
 
 
317
 
318
  messages = [{
319
  "role": "user",
@@ -322,7 +272,13 @@ def generate_image(model_name: str, text: str, image: Image.Image,
322
  {"type": "text", "text": text},
323
  ]
324
  }]
325
- prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
 
 
 
 
 
326
  inputs = processor(
327
  text=[prompt_full],
328
  images=[image],
@@ -331,53 +287,58 @@ def generate_image(model_name: str, text: str, image: Image.Image,
331
  truncation=True,
332
  max_length=MAX_INPUT_TOKEN_LENGTH
333
  ).to(device)
 
334
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
335
- generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
 
 
 
 
 
 
 
 
 
 
336
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
337
  thread.start()
 
338
  buffer = ""
339
  for new_text in streamer:
340
- buffer += new_text
341
  time.sleep(0.01)
342
- yield buffer, buffer
343
 
344
- @spaces.GPU(duration=calc_timeout_video)
345
- def generate_video(model_name: str, text: str, video_path: str,
346
- max_new_tokens: int = 1024,
347
- temperature: float = 0.6,
348
- top_p: float = 0.9,
349
- top_k: int = 50,
350
- repetition_penalty: float = 1.2,
351
- gpu_timeout: int = 90):
352
- """
353
- Generates responses using the selected model for video input.
354
- Yields raw text and Markdown-formatted text.
355
- """
356
- if model_name == "Cosmos-Reason1-7B":
357
- processor, model = processor_m, model_m
358
- elif model_name == "docscopeOCR-7B-050425-exp":
359
- processor, model = processor_x, model_x
360
- elif model_name == "Captioner-7B-Qwen2.5VL":
361
- processor, model = processor_z, model_z
362
- elif model_name == "visionOCR-3B":
363
- processor, model = processor_v, model_v
364
- else:
365
- yield "Invalid model selected.", "Invalid model selected."
366
- return
367
 
368
- if video_path is None:
369
- yield "Please upload a video.", "Please upload a video."
370
- return
371
 
 
 
 
 
 
 
 
 
 
 
 
 
372
  frames = downsample_video(video_path)
 
 
 
373
  messages = [
374
  {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
375
  {"role": "user", "content": [{"type": "text", "text": text}]}
376
  ]
377
- for frame in frames:
378
- image, timestamp = frame
379
  messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
380
  messages[1]["content"].append({"type": "image", "image": image})
 
381
  inputs = processor.apply_chat_template(
382
  messages,
383
  tokenize=True,
@@ -387,99 +348,1096 @@ def generate_video(model_name: str, text: str, video_path: str,
387
  truncation=True,
388
  max_length=MAX_INPUT_TOKEN_LENGTH
389
  ).to(device)
 
390
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
391
  generation_kwargs = {
392
  **inputs,
393
  "streamer": streamer,
394
- "max_new_tokens": max_new_tokens,
395
  "do_sample": True,
396
- "temperature": temperature,
397
- "top_p": top_p,
398
- "top_k": top_k,
399
- "repetition_penalty": repetition_penalty,
400
  }
 
401
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
402
  thread.start()
 
403
  buffer = ""
404
  for new_text in streamer:
405
- buffer += new_text
406
  time.sleep(0.01)
407
- yield buffer, buffer
408
 
409
- image_examples = [
410
- ["Perform OCR on the text in the image.", "images/1.jpg"],
411
- ["Explain the scene in detail.", "images/2.jpg"]
412
- ]
413
 
414
- video_examples = [
415
- ["Explain the Ad in Detail", "videos/1.mp4"],
416
- ["Identify the main actions in the video", "videos/2.mp4"]
417
- ]
418
 
419
- with gr.Blocks() as demo:
420
- gr.Markdown("# **DocScope R1**", elem_id="main-title")
421
- with gr.Row():
422
- with gr.Column(scale=2):
423
- with gr.Tabs():
424
- with gr.TabItem("Image Inference"):
425
- image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
426
- image_upload = gr.Image(type="pil", label="Upload Image", height=290)
427
- image_submit = gr.Button("Submit", variant="primary")
428
- gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
429
- with gr.TabItem("Video Inference"):
430
- video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
431
- video_upload = gr.Video(label="Upload Video", height=290)
432
- video_submit = gr.Button("Submit", variant="primary")
433
- gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
434
-
435
- with gr.Accordion("Advanced options", open=False):
436
- max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
437
- temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
438
- top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
439
- top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
440
- repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
441
-
442
- with gr.Column(scale=3):
443
- gr.Markdown("## Output", elem_id="output-title")
444
- raw_output = gr.Textbox(label="Raw Output Stream", interactive=True, lines=11)
445
- with gr.Accordion("(Result.md)", open=False):
446
- markdown_output = gr.Markdown()
447
-
448
- model_choice = gr.Radio(
449
- choices=["Cosmos-Reason1-7B", "docscopeOCR-7B-050425-exp", "Captioner-7B-Qwen2.5VL", "visionOCR-3B"],
450
- label="Select Model",
451
- value="Cosmos-Reason1-7B"
452
  )
453
-
454
- with gr.Row(elem_id="gpu-duration-container"):
455
- with gr.Column():
456
- gr.Markdown("**GPU Duration (seconds)**")
457
- radioanimated_gpu_duration = RadioAnimated(
458
- choices=["60", "90", "120", "180", "240", "300"],
459
- value="60",
460
- elem_id="radioanimated_gpu_duration"
461
- )
462
- gpu_duration_state = gr.Number(value=60, visible=False)
463
-
464
- gr.Markdown("*Note: Higher GPU duration allows for longer processing but consumes more GPU quota.*")
465
-
466
- radioanimated_gpu_duration.change(
467
- fn=apply_gpu_duration,
468
- inputs=radioanimated_gpu_duration,
469
- outputs=[gpu_duration_state],
470
- api_visibility="private"
471
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
472
 
473
- image_submit.click(
474
- fn=generate_image,
475
- inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_duration_state],
476
- outputs=[raw_output, markdown_output]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
477
  )
478
- video_submit.click(
479
- fn=generate_video,
480
- inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_duration_state],
481
- outputs=[raw_output, markdown_output]
 
 
482
  )
483
 
484
  if __name__ == "__main__":
485
- demo.queue(max_size=30).launch(css=css, theme=steel_blue_theme, mcp_server=True, ssr_mode=False, show_error=True)
 
 
 
 
 
 
 
1
  import os
2
+ import gc
 
3
  import json
4
+ import uuid
5
  import time
6
+ import base64
7
+ from io import BytesIO
8
  from threading import Thread
 
9
 
10
  import gradio as gr
11
  import spaces
 
13
  import numpy as np
14
  from PIL import Image
15
  import cv2
 
16
 
17
  from transformers import (
18
  Qwen2VLForConditionalGeneration,
19
  Qwen2_5_VLForConditionalGeneration,
20
  AutoProcessor,
21
  TextIteratorStreamer,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  )
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  MAX_MAX_NEW_TOKENS = 2048
25
  DEFAULT_MAX_NEW_TOKENS = 1024
26
  MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
27
 
28
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
29
+ print("Using device:", device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  MODEL_ID_M = "nvidia/Cosmos-Reason1-7B"
32
  processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
 
64
  torch_dtype=torch.float16
65
  ).to(device).eval()
66
 
67
+ MODEL_MAP = {
68
+ "Cosmos-Reason1-7B": (processor_m, model_m),
69
+ "docscopeOCR-7B-050425-exp": (processor_x, model_x),
70
+ "Captioner-7B-Qwen2.5VL": (processor_z, model_z),
71
+ "visionOCR-3B": (processor_v, model_v),
72
+ }
73
+
74
+ MODEL_CHOICES = list(MODEL_MAP.keys())
75
+
76
+ image_examples = [
77
+ {"query": "Perform OCR on the text in the image.", "media": "images/1.jpg", "model": "docscopeOCR-7B-050425-exp", "mode": "image"},
78
+ {"query": "Explain the scene in detail.", "media": "images/2.jpg", "model": "Cosmos-Reason1-7B", "mode": "image"},
79
+ ]
80
+
81
+ video_examples = [
82
+ {"query": "Explain the Ad in Detail", "media": "videos/1.mp4", "model": "Captioner-7B-Qwen2.5VL", "mode": "video"},
83
+ {"query": "Identify the main actions in the video", "media": "videos/2.mp4", "model": "visionOCR-3B", "mode": "video"},
84
+ ]
85
+
86
+ all_examples = image_examples + video_examples
87
+
88
+
89
+ def pil_to_data_url(img: Image.Image, fmt="PNG"):
90
+ buf = BytesIO()
91
+ img.save(buf, format=fmt)
92
+ data = base64.b64encode(buf.getvalue()).decode()
93
+ mime = "image/png" if fmt.upper() == "PNG" else "image/jpeg"
94
+ return f"data:{mime};base64,{data}"
95
+
96
+
97
+ def file_to_data_url(path):
98
+ if not os.path.exists(path):
99
+ return ""
100
+ ext = path.rsplit(".", 1)[-1].lower()
101
+ mime = {
102
+ "jpg": "image/jpeg",
103
+ "jpeg": "image/jpeg",
104
+ "png": "image/png",
105
+ "webp": "image/webp",
106
+ "mp4": "video/mp4",
107
+ "mov": "video/quicktime",
108
+ "webm": "video/webm",
109
+ }.get(ext, "application/octet-stream")
110
+ with open(path, "rb") as f:
111
+ data = base64.b64encode(f.read()).decode()
112
+ return f"data:{mime};base64,{data}"
113
+
114
+
115
+ def make_thumb_b64(path, mode="image", max_dim=240):
116
+ try:
117
+ if mode == "video":
118
+ cap = cv2.VideoCapture(path)
119
+ ok, frame = cap.read()
120
+ cap.release()
121
+ if not ok:
122
+ return ""
123
+ frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
124
+ img = Image.fromarray(frame).convert("RGB")
125
+ else:
126
+ img = Image.open(path).convert("RGB")
127
+ img.thumbnail((max_dim, max_dim))
128
+ return pil_to_data_url(img, "JPEG")
129
+ except Exception as e:
130
+ print("Thumbnail error:", e)
131
+ return ""
132
+
133
+
134
+ def build_example_cards_html():
135
+ cards = ""
136
+ for i, ex in enumerate(all_examples):
137
+ thumb = make_thumb_b64(ex["media"], ex["mode"])
138
+ prompt_short = ex["query"][:72] + ("..." if len(ex["query"]) > 72 else "")
139
+ media_badge = "VIDEO" if ex["mode"] == "video" else "IMAGE"
140
+ cards += f"""
141
+ <div class="example-card" data-idx="{i}">
142
+ <div class="example-thumb-wrap">
143
+ {"<img src='" + thumb + "' alt=''>" if thumb else "<div class='example-thumb-placeholder'>Preview</div>"}
144
+ <div class="example-media-chip">{media_badge}</div>
145
+ </div>
146
+ <div class="example-meta-row">
147
+ <span class="example-badge">{ex["model"]}</span>
148
+ </div>
149
+ <div class="example-prompt-text">{prompt_short}</div>
150
+ </div>
151
+ """
152
+ return cards
153
+
154
+
155
+ EXAMPLE_CARDS_HTML = build_example_cards_html()
156
+
157
+
158
+ def load_example_data(idx_str):
159
+ try:
160
+ idx = int(float(idx_str))
161
+ except Exception:
162
+ return json.dumps({"status": "error", "message": "Invalid example index"})
163
+ if idx < 0 or idx >= len(all_examples):
164
+ return json.dumps({"status": "error", "message": "Example index out of range"})
165
+ ex = all_examples[idx]
166
+ media_b64 = file_to_data_url(ex["media"])
167
+ if not media_b64:
168
+ return json.dumps({"status": "error", "message": f"Could not load example {ex['mode']}"})
169
+ return json.dumps({
170
+ "status": "ok",
171
+ "query": ex["query"],
172
+ "media": media_b64,
173
+ "model": ex["model"],
174
+ "mode": ex["mode"],
175
+ "name": os.path.basename(ex["media"]),
176
+ })
177
+
178
+
179
+ def b64_to_pil(b64_str):
180
+ if not b64_str:
181
+ return None
182
+ try:
183
+ if b64_str.startswith("data:"):
184
+ _, data = b64_str.split(",", 1)
185
+ else:
186
+ data = b64_str
187
+ image_data = base64.b64decode(data)
188
+ return Image.open(BytesIO(image_data)).convert("RGB")
189
+ except Exception:
190
+ return None
191
+
192
+
193
+ def b64_to_temp_video(b64_str):
194
+ if not b64_str:
195
+ return None
196
+ try:
197
+ if b64_str.startswith("data:"):
198
+ header, data = b64_str.split(",", 1)
199
+ mime = header.split(";")[0].replace("data:", "")
200
+ else:
201
+ data = b64_str
202
+ mime = "video/mp4"
203
+ ext = {
204
+ "video/mp4": ".mp4",
205
+ "video/webm": ".webm",
206
+ "video/quicktime": ".mov",
207
+ }.get(mime, ".mp4")
208
+ raw = base64.b64decode(data)
209
+ temp_dir = os.path.join("/tmp", "docscope_r1_media")
210
+ os.makedirs(temp_dir, exist_ok=True)
211
+ path = os.path.join(temp_dir, f"{uuid.uuid4().hex}{ext}")
212
+ with open(path, "wb") as f:
213
+ f.write(raw)
214
+ return path
215
+ except Exception:
216
+ return None
217
+
218
+
219
  def downsample_video(video_path):
 
 
 
 
220
  vidcap = cv2.VideoCapture(video_path)
221
  total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
222
+ fps = vidcap.get(cv2.CAP_PROP_FPS) or 1.0
223
  frames = []
224
+ frame_count = min(total_frames, 10) if total_frames > 0 else 0
225
+ if frame_count == 0:
226
+ vidcap.release()
227
+ return frames
228
+ frame_indices = np.linspace(0, total_frames - 1, frame_count, dtype=int)
229
  for i in frame_indices:
230
+ vidcap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
231
  success, image = vidcap.read()
232
  if success:
233
  image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
234
  pil_image = Image.fromarray(image)
235
+ timestamp = round(float(i) / float(fps), 2)
236
  frames.append((pil_image, timestamp))
237
  vidcap.release()
238
  return frames
239
 
240
+
241
+ def calc_timeout_image(model_name, text, image, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_timeout):
 
 
242
  try:
243
  return int(gpu_timeout)
244
+ except Exception:
245
  return 60
246
 
247
+
248
+ def calc_timeout_video(model_name, text, video_path, max_new_tokens, temperature, top_p, top_k, repetition_penalty, gpu_timeout):
 
 
249
  try:
250
  return int(gpu_timeout)
251
+ except Exception:
252
  return 60
253
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
 
255
+ @spaces.GPU(duration=calc_timeout_image)
256
+ def generate_image(model_name, text, image, max_new_tokens=1024, temperature=0.6, top_p=0.9, top_k=50, repetition_penalty=1.2, gpu_timeout=60):
257
+ if not model_name or model_name not in MODEL_MAP:
258
+ raise gr.Error("Please select a valid model.")
259
  if image is None:
260
+ raise gr.Error("Please upload an image.")
261
+ if not text or not str(text).strip():
262
+ raise gr.Error("Please enter your instruction.")
263
+ if len(str(text)) > MAX_INPUT_TOKEN_LENGTH * 8:
264
+ raise gr.Error("Query is too long. Please shorten your input.")
265
+
266
+ processor, model = MODEL_MAP[model_name]
267
 
268
  messages = [{
269
  "role": "user",
 
272
  {"type": "text", "text": text},
273
  ]
274
  }]
275
+
276
+ prompt_full = processor.apply_chat_template(
277
+ messages,
278
+ tokenize=False,
279
+ add_generation_prompt=True
280
+ )
281
+
282
  inputs = processor(
283
  text=[prompt_full],
284
  images=[image],
 
287
  truncation=True,
288
  max_length=MAX_INPUT_TOKEN_LENGTH
289
  ).to(device)
290
+
291
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
292
+ generation_kwargs = {
293
+ **inputs,
294
+ "streamer": streamer,
295
+ "max_new_tokens": int(max_new_tokens),
296
+ "do_sample": True,
297
+ "temperature": float(temperature),
298
+ "top_p": float(top_p),
299
+ "top_k": int(top_k),
300
+ "repetition_penalty": float(repetition_penalty),
301
+ }
302
+
303
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
304
  thread.start()
305
+
306
  buffer = ""
307
  for new_text in streamer:
308
+ buffer += new_text.replace("<|im_end|>", "")
309
  time.sleep(0.01)
310
+ yield buffer
311
 
312
+ gc.collect()
313
+ if torch.cuda.is_available():
314
+ torch.cuda.empty_cache()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315
 
 
 
 
316
 
317
+ @spaces.GPU(duration=calc_timeout_video)
318
+ def generate_video(model_name, text, video_path, max_new_tokens=1024, temperature=0.6, top_p=0.9, top_k=50, repetition_penalty=1.2, gpu_timeout=90):
319
+ if not model_name or model_name not in MODEL_MAP:
320
+ raise gr.Error("Please select a valid model.")
321
+ if not video_path:
322
+ raise gr.Error("Please upload a video.")
323
+ if not text or not str(text).strip():
324
+ raise gr.Error("Please enter your instruction.")
325
+ if len(str(text)) > MAX_INPUT_TOKEN_LENGTH * 8:
326
+ raise gr.Error("Query is too long. Please shorten your input.")
327
+
328
+ processor, model = MODEL_MAP[model_name]
329
  frames = downsample_video(video_path)
330
+ if not frames:
331
+ raise gr.Error("Could not read the uploaded video.")
332
+
333
  messages = [
334
  {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
335
  {"role": "user", "content": [{"type": "text", "text": text}]}
336
  ]
337
+
338
+ for image, timestamp in frames:
339
  messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
340
  messages[1]["content"].append({"type": "image", "image": image})
341
+
342
  inputs = processor.apply_chat_template(
343
  messages,
344
  tokenize=True,
 
348
  truncation=True,
349
  max_length=MAX_INPUT_TOKEN_LENGTH
350
  ).to(device)
351
+
352
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
353
  generation_kwargs = {
354
  **inputs,
355
  "streamer": streamer,
356
+ "max_new_tokens": int(max_new_tokens),
357
  "do_sample": True,
358
+ "temperature": float(temperature),
359
+ "top_p": float(top_p),
360
+ "top_k": int(top_k),
361
+ "repetition_penalty": float(repetition_penalty),
362
  }
363
+
364
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
365
  thread.start()
366
+
367
  buffer = ""
368
  for new_text in streamer:
369
+ buffer += new_text.replace("<|im_end|>", "")
370
  time.sleep(0.01)
371
+ yield buffer
372
 
373
+ gc.collect()
374
+ if torch.cuda.is_available():
375
+ torch.cuda.empty_cache()
 
376
 
 
 
 
 
377
 
378
+ def run_inference(mode, model_name, text, image_b64, video_b64, max_new_tokens_v, temperature_v, top_p_v, top_k_v, repetition_penalty_v, gpu_timeout_v):
379
+ if mode == "video":
380
+ temp_video_path = b64_to_temp_video(video_b64)
381
+ if not temp_video_path:
382
+ raise gr.Error("Could not decode uploaded video.")
383
+ try:
384
+ yield from generate_video(
385
+ model_name=model_name,
386
+ text=text,
387
+ video_path=temp_video_path,
388
+ max_new_tokens=max_new_tokens_v,
389
+ temperature=temperature_v,
390
+ top_p=top_p_v,
391
+ top_k=top_k_v,
392
+ repetition_penalty=repetition_penalty_v,
393
+ gpu_timeout=gpu_timeout_v,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
  )
395
+ finally:
396
+ try:
397
+ os.remove(temp_video_path)
398
+ except Exception:
399
+ pass
400
+ else:
401
+ image = b64_to_pil(image_b64)
402
+ yield from generate_image(
403
+ model_name=model_name,
404
+ text=text,
405
+ image=image,
406
+ max_new_tokens=max_new_tokens_v,
407
+ temperature=temperature_v,
408
+ top_p=top_p_v,
409
+ top_k=top_k_v,
410
+ repetition_penalty=repetition_penalty_v,
411
+ gpu_timeout=gpu_timeout_v,
412
+ )
413
+
414
+
415
+ def noop():
416
+ return None
417
+
418
+
419
# Global stylesheet injected into the Gradio app: dark (#0f0f13/#18181b) theme
# with a pink (#FF1493) accent, custom upload/preview drop zone, model/mode tab
# pills, toast notifications, run button, output panel with loader, slider rows,
# status bar, and a responsive single-column fallback below 980px.
# NOTE(review): reconstructed from a diff rendering; internal whitespace of this
# raw string may differ from the original bytes (cosmetic for CSS) — confirm
# against the committed file.
css = r"""
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800&family=JetBrains+Mono:wght@400;500;600&display=swap');
*{box-sizing:border-box;margin:0;padding:0}
html,body{height:100%;overflow-x:hidden}
body,.gradio-container{
background:#0f0f13!important;
font-family:'Inter',system-ui,-apple-system,sans-serif!important;
font-size:14px!important;color:#e4e4e7!important;min-height:100vh;overflow-x:hidden;
}
.dark body,.dark .gradio-container{background:#0f0f13!important;color:#e4e4e7!important}
footer{display:none!important}
.hidden-input{display:none!important;height:0!important;overflow:hidden!important;margin:0!important;padding:0!important}

#gradio-run-btn,#example-load-btn{
position:absolute!important;left:-9999px!important;top:-9999px!important;
width:1px!important;height:1px!important;opacity:0.01!important;
pointer-events:none!important;overflow:hidden!important;
}

.app-shell{
background:#18181b;border:1px solid #27272a;border-radius:16px;
margin:12px auto;max-width:1400px;overflow:hidden;
box-shadow:0 25px 50px -12px rgba(0,0,0,.6),0 0 0 1px rgba(255,255,255,.03);
}
.app-header{
background:linear-gradient(135deg,#18181b,#1e1e24);border-bottom:1px solid #27272a;
padding:14px 24px;display:flex;align-items:center;justify-content:space-between;flex-wrap:wrap;gap:12px;
}
.app-header-left{display:flex;align-items:center;gap:12px}
.app-logo{
width:38px;height:38px;background:linear-gradient(135deg,#FF1493,#E1007A,#FF4DB2);
border-radius:10px;display:flex;align-items:center;justify-content:center;
box-shadow:0 4px 12px rgba(255,20,147,.35);
}
.app-logo svg{width:22px;height:22px;fill:#fff;flex-shrink:0}
.app-title{
font-size:18px;font-weight:700;background:linear-gradient(135deg,#f5f5f5,#bdbdbd);
-webkit-background-clip:text;-webkit-text-fill-color:transparent;letter-spacing:-.3px;
}
.app-badge{
font-size:11px;font-weight:600;padding:3px 10px;border-radius:20px;
background:rgba(255,20,147,.12);color:#ff7ac7;border:1px solid rgba(255,20,147,.25);letter-spacing:.3px;
}
.app-badge.fast{background:rgba(225,0,122,.10);color:#ff66be;border:1px solid rgba(225,0,122,.22)}

.model-tabs-bar{
background:#18181b;border-bottom:1px solid #27272a;padding:10px 16px;
display:flex;gap:8px;align-items:center;flex-wrap:wrap;
}
.model-tab{
display:inline-flex;align-items:center;justify-content:center;gap:6px;
min-width:32px;height:34px;background:transparent;border:1px solid #27272a;
border-radius:999px;cursor:pointer;font-size:12px;font-weight:600;padding:0 12px;
color:#ffffff!important;transition:all .15s ease;
}
.model-tab:hover{background:rgba(255,20,147,.12);border-color:rgba(255,20,147,.35)}
.model-tab.active{background:rgba(255,20,147,.22);border-color:#FF1493;color:#fff!important;box-shadow:0 0 0 2px rgba(255,20,147,.10)}
.model-tab-label{font-size:12px;color:#ffffff!important;font-weight:600}

.mode-tabs-bar{
background:#18181b;border-bottom:1px solid #27272a;padding:10px 16px 12px;
display:flex;gap:8px;align-items:center;flex-wrap:wrap;
}
.mode-tab{
display:inline-flex;align-items:center;justify-content:center;gap:6px;
min-width:110px;height:34px;background:transparent;border:1px solid #27272a;
border-radius:999px;cursor:pointer;font-size:12px;font-weight:700;padding:0 14px;
color:#ffffff!important;transition:all .15s ease;text-transform:uppercase;letter-spacing:.5px;
}
.mode-tab:hover{background:rgba(255,20,147,.12);border-color:rgba(255,20,147,.35)}
.mode-tab.active{background:rgba(255,20,147,.22);border-color:#FF1493;color:#fff!important;box-shadow:0 0 0 2px rgba(255,20,147,.10)}

.app-main-row{display:flex;gap:0;flex:1;overflow:hidden}
.app-main-left{flex:1;display:flex;flex-direction:column;min-width:0;border-right:1px solid #27272a}
.app-main-right{width:470px;display:flex;flex-direction:column;flex-shrink:0;background:#18181b}

#media-drop-zone{
position:relative;background:#09090b;height:440px;min-height:440px;max-height:440px;
overflow:hidden;
}
#media-drop-zone.drag-over{outline:2px solid #FF1493;outline-offset:-2px;background:rgba(255,20,147,.04)}
.upload-prompt-modern{
position:absolute;inset:0;display:flex;align-items:center;justify-content:center;
padding:20px;z-index:20;overflow:hidden;
}
.upload-click-area{
display:flex;flex-direction:column;align-items:center;justify-content:center;
cursor:pointer;padding:28px 36px;max-width:92%;max-height:92%;
border:2px dashed #3f3f46;border-radius:16px;
background:rgba(255,20,147,.03);transition:all .2s ease;gap:8px;text-align:center;
overflow:hidden;
}
.upload-click-area:hover{background:rgba(255,20,147,.08);border-color:#FF1493;transform:scale(1.02)}
.upload-click-area:active{background:rgba(255,20,147,.12);transform:scale(.99)}
.upload-click-area svg{width:86px;height:86px;max-width:100%;flex-shrink:0}
.upload-main-text{color:#a1a1aa;font-size:14px;font-weight:600;margin-top:4px}
.upload-sub-text{color:#71717a;font-size:12px}

.single-preview-wrap{
width:100%;height:100%;display:none;align-items:center;justify-content:center;padding:16px;
overflow:hidden;
}
.single-preview-card{
width:100%;height:100%;max-width:100%;max-height:100%;border-radius:14px;
overflow:hidden;border:1px solid #27272a;background:#111114;
display:flex;align-items:center;justify-content:center;position:relative;
}
.single-preview-card img,.single-preview-card video{
width:100%;height:100%;max-width:100%;max-height:100%;
object-fit:contain;display:block;background:#000;
}
.preview-overlay-actions{
position:absolute;top:12px;right:12px;display:flex;gap:8px;z-index:5;
}
.preview-action-btn{
display:inline-flex;align-items:center;justify-content:center;
min-width:34px;height:34px;padding:0 12px;background:rgba(0,0,0,.65);
border:1px solid rgba(255,255,255,.14);border-radius:10px;cursor:pointer;
color:#fff!important;font-size:12px;font-weight:600;transition:all .15s ease;
}
.preview-action-btn:hover{background:#FF1493;border-color:#FF1493}

.hint-bar{
background:rgba(255,20,147,.06);border-top:1px solid #27272a;border-bottom:1px solid #27272a;
padding:10px 20px;font-size:13px;color:#a1a1aa;line-height:1.7;
}
.hint-bar b{color:#ff7ac7;font-weight:600}
.hint-bar kbd{
display:inline-block;padding:1px 6px;background:#27272a;border:1px solid #3f3f46;
border-radius:4px;font-family:'JetBrains Mono',monospace;font-size:11px;color:#a1a1aa;
}

.examples-section{border-top:1px solid #27272a;padding:12px 16px}
.examples-title{
font-size:12px;font-weight:600;color:#71717a;text-transform:uppercase;
letter-spacing:.8px;margin-bottom:10px;
}
.examples-scroll{display:flex;gap:10px;overflow-x:auto;padding-bottom:8px}
.examples-scroll::-webkit-scrollbar{height:6px}
.examples-scroll::-webkit-scrollbar-track{background:#09090b;border-radius:3px}
.examples-scroll::-webkit-scrollbar-thumb{background:#27272a;border-radius:3px}
.examples-scroll::-webkit-scrollbar-thumb:hover{background:#3f3f46}
.example-card{
position:relative;
flex-shrink:0;width:220px;background:#09090b;border:1px solid #27272a;
border-radius:10px;overflow:hidden;cursor:pointer;transition:all .2s ease;
}
.example-card:hover{border-color:#FF1493;transform:translateY(-2px);box-shadow:0 4px 12px rgba(255,20,147,.15)}
.example-card.loading{opacity:.5;pointer-events:none}
.example-thumb-wrap{height:120px;overflow:hidden;background:#18181b;position:relative}
.example-thumb-wrap img{width:100%;height:100%;object-fit:cover}
.example-media-chip{
position:absolute;top:8px;left:8px;
display:inline-flex;padding:3px 7px;background:rgba(0,0,0,.7);border:1px solid rgba(255,255,255,.12);
border-radius:999px;font-size:10px;font-weight:700;color:#fff;letter-spacing:.5px;
}
.example-thumb-placeholder{
width:100%;height:100%;display:flex;align-items:center;justify-content:center;
background:#18181b;color:#3f3f46;font-size:11px;
}
.example-meta-row{padding:6px 10px;display:flex;align-items:center;gap:6px}
.example-badge{
display:inline-flex;padding:2px 7px;background:rgba(255,20,147,.12);border-radius:4px;
font-size:10px;font-weight:600;color:#ff7ac7;font-family:'JetBrains Mono',monospace;white-space:nowrap;
}
.example-prompt-text{
padding:0 10px 8px;font-size:11px;color:#a1a1aa;line-height:1.4;
display:-webkit-box;-webkit-line-clamp:2;-webkit-box-orient:vertical;overflow:hidden;
}

.panel-card{border-bottom:1px solid #27272a}
.panel-card-title{
padding:12px 20px;font-size:12px;font-weight:600;color:#71717a;
text-transform:uppercase;letter-spacing:.8px;border-bottom:1px solid rgba(39,39,42,.6);
}
.panel-card-body{padding:16px 20px;display:flex;flex-direction:column;gap:8px}
.modern-label{font-size:13px;font-weight:500;color:#a1a1aa;margin-bottom:4px;display:block}
.modern-textarea{
width:100%;background:#09090b;border:1px solid #27272a;border-radius:8px;
padding:10px 14px;font-family:'Inter',sans-serif;font-size:14px;color:#e4e4e7;
resize:none;outline:none;min-height:100px;transition:border-color .2s;
}
.modern-textarea:focus{border-color:#FF1493;box-shadow:0 0 0 3px rgba(255,20,147,.15)}
.modern-textarea::placeholder{color:#3f3f46}
.modern-textarea.error-flash{
border-color:#ef4444!important;box-shadow:0 0 0 3px rgba(239,68,68,.2)!important;animation:shake .4s ease;
}
@keyframes shake{0%,100%{transform:translateX(0)}20%,60%{transform:translateX(-4px)}40%,80%{transform:translateX(4px)}}

.toast-notification{
position:fixed;top:24px;left:50%;transform:translateX(-50%) translateY(-120%);
z-index:9999;padding:10px 24px;border-radius:10px;font-family:'Inter',sans-serif;
font-size:14px;font-weight:600;display:flex;align-items:center;gap:8px;
box-shadow:0 8px 24px rgba(0,0,0,.5);
transition:transform .35s cubic-bezier(.34,1.56,.64,1),opacity .35s ease;opacity:0;pointer-events:none;
}
.toast-notification.visible{transform:translateX(-50%) translateY(0);opacity:1;pointer-events:auto}
.toast-notification.error{background:linear-gradient(135deg,#dc2626,#b91c1c);color:#fff;border:1px solid rgba(255,255,255,.15)}
.toast-notification.warning{background:linear-gradient(135deg,#d97706,#b45309);color:#fff;border:1px solid rgba(255,255,255,.15)}
.toast-notification.info{background:linear-gradient(135deg,#ec4899,#be185d);color:#fff;border:1px solid rgba(255,255,255,.15)}
.toast-notification .toast-icon{font-size:16px;line-height:1}
.toast-notification .toast-text{line-height:1.3}

.btn-run{
display:flex;align-items:center;justify-content:center;gap:8px;width:100%;
background:linear-gradient(135deg,#FF1493,#D10073);border:none;border-radius:10px;
padding:12px 24px;cursor:pointer;font-size:15px;font-weight:600;font-family:'Inter',sans-serif;
color:#ffffff!important;-webkit-text-fill-color:#ffffff!important;
transition:all .2s ease;letter-spacing:-.2px;
box-shadow:0 4px 16px rgba(255,20,147,.3),inset 0 1px 0 rgba(255,255,255,.1);
}
.btn-run:hover{
background:linear-gradient(135deg,#ff4db2,#FF1493);transform:translateY(-1px);
box-shadow:0 6px 24px rgba(255,20,147,.45),inset 0 1px 0 rgba(255,255,255,.15);
}
.btn-run:active{transform:translateY(0);box-shadow:0 2px 8px rgba(255,20,147,.3)}
#custom-run-btn,#custom-run-btn *,#run-btn-label,.btn-run,.btn-run *{
color:#ffffff!important;-webkit-text-fill-color:#ffffff!important;fill:#ffffff!important;
}

.output-frame{border-bottom:1px solid #27272a;display:flex;flex-direction:column;position:relative}
.output-frame .out-title,
.output-frame .out-title *,
#output-title-label{
color:#ffffff!important;
-webkit-text-fill-color:#ffffff!important;
}
.output-frame .out-title{
padding:10px 20px;font-size:13px;font-weight:700;
text-transform:uppercase;letter-spacing:.8px;border-bottom:1px solid rgba(39,39,42,.6);
display:flex;align-items:center;justify-content:space-between;gap:8px;flex-wrap:wrap;
}
.out-title-right{display:flex;gap:8px;align-items:center}
.out-action-btn{
display:inline-flex;align-items:center;justify-content:center;background:rgba(255,20,147,.1);
border:1px solid rgba(255,20,147,.2);border-radius:6px;cursor:pointer;padding:3px 10px;
font-size:11px;font-weight:500;color:#ff7ac7!important;gap:4px;height:24px;transition:all .15s;
}
.out-action-btn:hover{background:rgba(255,20,147,.2);border-color:rgba(255,20,147,.35);color:#ffffff!important}
.out-action-btn svg{width:12px;height:12px;fill:#ff7ac7}
.output-frame .out-body{
flex:1;background:#09090b;display:flex;align-items:stretch;justify-content:stretch;
overflow:hidden;min-height:320px;position:relative;
}
.output-scroll-wrap{
width:100%;height:100%;padding:0;overflow:hidden;
}
.output-textarea{
width:100%;height:320px;min-height:320px;max-height:320px;background:#09090b;color:#e4e4e7;
border:none;outline:none;padding:16px 18px;font-size:13px;line-height:1.6;
font-family:'JetBrains Mono',monospace;overflow:auto;resize:none;white-space:pre-wrap;
}
.output-textarea::placeholder{color:#52525b}
.output-textarea.error-flash{
box-shadow:inset 0 0 0 2px rgba(239,68,68,.6);
}
.modern-loader{
display:none;position:absolute;top:0;left:0;right:0;bottom:0;background:rgba(9,9,11,.92);
z-index:15;flex-direction:column;align-items:center;justify-content:center;gap:16px;backdrop-filter:blur(4px);
}
.modern-loader.active{display:flex}
.modern-loader .loader-spinner{
width:36px;height:36px;border:3px solid #27272a;border-top-color:#FF1493;
border-radius:50%;animation:spin .8s linear infinite;
}
@keyframes spin{to{transform:rotate(360deg)}}
.modern-loader .loader-text{font-size:13px;color:#a1a1aa;font-weight:500}
.loader-bar-track{width:200px;height:4px;background:#27272a;border-radius:2px;overflow:hidden}
.loader-bar-fill{
height:100%;background:linear-gradient(90deg,#FF1493,#FF69C8,#FF1493);
background-size:200% 100%;animation:shimmer 1.5s ease-in-out infinite;border-radius:2px;
}
@keyframes shimmer{0%{background-position:200% 0}100%{background-position:-200% 0}}

.settings-group{border:1px solid #27272a;border-radius:10px;margin:12px 16px;padding:0;overflow:hidden}
.settings-group-title{
font-size:12px;font-weight:600;color:#71717a;text-transform:uppercase;letter-spacing:.8px;
padding:10px 16px;border-bottom:1px solid #27272a;background:rgba(24,24,27,.5);
}
.settings-group-body{padding:14px 16px;display:flex;flex-direction:column;gap:12px}
.slider-row{display:flex;align-items:center;gap:10px;min-height:28px}
.slider-row label{font-size:13px;font-weight:500;color:#a1a1aa;min-width:118px;flex-shrink:0}
.slider-row input[type="range"]{
flex:1;-webkit-appearance:none;appearance:none;height:6px;background:#27272a;
border-radius:3px;outline:none;min-width:0;
}
.slider-row input[type="range"]::-webkit-slider-thumb{
-webkit-appearance:none;width:16px;height:16px;background:linear-gradient(135deg,#FF1493,#D10073);
border-radius:50%;cursor:pointer;box-shadow:0 2px 6px rgba(255,20,147,.4);transition:transform .15s;
}
.slider-row input[type="range"]::-webkit-slider-thumb:hover{transform:scale(1.2)}
.slider-row input[type="range"]::-moz-range-thumb{
width:16px;height:16px;background:linear-gradient(135deg,#FF1493,#D10073);
border-radius:50%;cursor:pointer;border:none;box-shadow:0 2px 6px rgba(255,20,147,.4);
}
.slider-row .slider-val{
min-width:58px;text-align:right;font-family:'JetBrains Mono',monospace;font-size:12px;
font-weight:500;padding:3px 8px;background:#09090b;border:1px solid #27272a;
border-radius:6px;color:#a1a1aa;flex-shrink:0;
}

.app-statusbar{
background:#18181b;border-top:1px solid #27272a;padding:6px 20px;
display:flex;gap:12px;height:34px;align-items:center;font-size:12px;
}
.app-statusbar .sb-section{
padding:0 12px;flex:1;display:flex;align-items:center;font-family:'JetBrains Mono',monospace;
font-size:12px;color:#52525b;overflow:hidden;white-space:nowrap;
}
.app-statusbar .sb-section.sb-fixed{
flex:0 0 auto;min-width:110px;text-align:center;justify-content:center;
padding:3px 12px;background:rgba(255,20,147,.08);border-radius:6px;color:#ff7ac7;font-weight:500;
}

.exp-note{padding:10px 20px;font-size:12px;color:#52525b;border-top:1px solid #27272a;text-align:center}
.exp-note a{color:#ff7ac7;text-decoration:none}
.exp-note a:hover{text-decoration:underline}

::-webkit-scrollbar{width:8px;height:8px}
::-webkit-scrollbar-track{background:#09090b}
::-webkit-scrollbar-thumb{background:#27272a;border-radius:4px}
::-webkit-scrollbar-thumb:hover{background:#3f3f46}

@media(max-width:980px){
.app-main-row{flex-direction:column}
.app-main-right{width:100%}
.app-main-left{border-right:none;border-bottom:1px solid #27272a}
}
"""
748
+
749
+ gallery_js = r"""
750
+ () => {
751
+ function init() {
752
+ if (window.__docScopeInitDone) return;
753
+
754
+ const dropZone = document.getElementById('media-drop-zone');
755
+ const uploadPrompt = document.getElementById('upload-prompt');
756
+ const uploadClick = document.getElementById('upload-click-area');
757
+ const fileInput = document.getElementById('custom-file-input');
758
+ const previewWrap = document.getElementById('single-preview-wrap');
759
+ const previewImg = document.getElementById('single-preview-img');
760
+ const previewVideo = document.getElementById('single-preview-video');
761
+ const btnUpload = document.getElementById('preview-upload-btn');
762
+ const btnClear = document.getElementById('preview-clear-btn');
763
+ const promptInput = document.getElementById('custom-query-input');
764
+ const runBtnEl = document.getElementById('custom-run-btn');
765
+ const outputArea = document.getElementById('custom-output-textarea');
766
+ const mediaStatus = document.getElementById('sb-media-status');
767
+ const exampleResultContainer = document.getElementById('example-result-data');
768
+
769
+ if (!dropZone || !fileInput || !promptInput || !previewWrap || !previewImg || !previewVideo) {
770
+ setTimeout(init, 250);
771
+ return;
772
+ }
773
+
774
+ window.__docScopeInitDone = true;
775
+ let mediaState = null;
776
+ let currentMode = 'image';
777
+ let toastTimer = null;
778
+
779
+ function showToast(message, type) {
780
+ let toast = document.getElementById('app-toast');
781
+ if (!toast) {
782
+ toast = document.createElement('div');
783
+ toast.id = 'app-toast';
784
+ toast.className = 'toast-notification';
785
+ toast.innerHTML = '<span class="toast-icon"></span><span class="toast-text"></span>';
786
+ document.body.appendChild(toast);
787
+ }
788
+ const icon = toast.querySelector('.toast-icon');
789
+ const text = toast.querySelector('.toast-text');
790
+ toast.className = 'toast-notification ' + (type || 'error');
791
+ if (type === 'warning') icon.textContent = '\u26A0';
792
+ else if (type === 'info') icon.textContent = '\u2139';
793
+ else icon.textContent = '\u2717';
794
+ text.textContent = message;
795
+ if (toastTimer) clearTimeout(toastTimer);
796
+ void toast.offsetWidth;
797
+ toast.classList.add('visible');
798
+ toastTimer = setTimeout(() => toast.classList.remove('visible'), 3500);
799
+ }
800
+ window.__showToast = showToast;
801
+
802
+ function showLoader() {
803
+ const l = document.getElementById('output-loader');
804
+ if (l) l.classList.add('active');
805
+ const sb = document.getElementById('sb-run-state');
806
+ if (sb) sb.textContent = 'Processing...';
807
+ }
808
+ function hideLoader() {
809
+ const l = document.getElementById('output-loader');
810
+ if (l) l.classList.remove('active');
811
+ const sb = document.getElementById('sb-run-state');
812
+ if (sb) sb.textContent = 'Done';
813
+ }
814
+ window.__showLoader = showLoader;
815
+ window.__hideLoader = hideLoader;
816
+
817
+ function flashPromptError() {
818
+ promptInput.classList.add('error-flash');
819
+ promptInput.focus();
820
+ setTimeout(() => promptInput.classList.remove('error-flash'), 800);
821
+ }
822
+
823
+ function flashOutputError() {
824
+ if (!outputArea) return;
825
+ outputArea.classList.add('error-flash');
826
+ setTimeout(() => outputArea.classList.remove('error-flash'), 800);
827
+ }
828
+
829
+ function setGradioValue(containerId, value) {
830
+ const container = document.getElementById(containerId);
831
+ if (!container) return;
832
+ container.querySelectorAll('input, textarea').forEach(el => {
833
+ if (el.type === 'file' || el.type === 'range' || el.type === 'checkbox') return;
834
+ const proto = el.tagName === 'TEXTAREA' ? HTMLTextAreaElement.prototype : HTMLInputElement.prototype;
835
+ const ns = Object.getOwnPropertyDescriptor(proto, 'value');
836
+ if (ns && ns.set) {
837
+ ns.set.call(el, value);
838
+ el.dispatchEvent(new Event('input', {bubbles:true, composed:true}));
839
+ el.dispatchEvent(new Event('change', {bubbles:true, composed:true}));
840
+ }
841
+ });
842
+ }
843
+
844
+ function syncMediaToGradio() {
845
+ setGradioValue('hidden-image-b64', mediaState && mediaState.mode === 'image' ? mediaState.b64 : '');
846
+ setGradioValue('hidden-video-b64', mediaState && mediaState.mode === 'video' ? mediaState.b64 : '');
847
+ const txt = mediaState ? (`1 ${mediaState.mode} uploaded`) : `No ${currentMode} uploaded`;
848
+ if (mediaStatus) mediaStatus.textContent = txt;
849
+ }
850
+
851
+ function syncPromptToGradio() {
852
+ setGradioValue('prompt-gradio-input', promptInput.value);
853
+ }
854
+
855
+ function syncModelToGradio(name) {
856
+ setGradioValue('hidden-model-name', name);
857
+ }
858
+
859
+ function syncModeToGradio(mode) {
860
+ setGradioValue('hidden-mode-name', mode);
861
+ }
862
+
863
+ function renderPreview() {
864
+ if (!mediaState) {
865
+ previewImg.src = '';
866
+ previewVideo.src = '';
867
+ previewImg.style.display = 'none';
868
+ previewVideo.style.display = 'none';
869
+ previewWrap.style.display = 'none';
870
+ if (uploadPrompt) uploadPrompt.style.display = 'flex';
871
+ syncMediaToGradio();
872
+ return;
873
+ }
874
+
875
+ if (mediaState.mode === 'video') {
876
+ previewImg.src = '';
877
+ previewImg.style.display = 'none';
878
+ previewVideo.src = mediaState.b64;
879
+ previewVideo.style.display = 'block';
880
+ previewWrap.style.display = 'flex';
881
+ } else {
882
+ previewVideo.pause();
883
+ previewVideo.removeAttribute('src');
884
+ previewVideo.load();
885
+ previewVideo.style.display = 'none';
886
+ previewImg.src = mediaState.b64;
887
+ previewImg.style.display = 'block';
888
+ previewWrap.style.display = 'flex';
889
+ }
890
+ if (uploadPrompt) uploadPrompt.style.display = 'none';
891
+ syncMediaToGradio();
892
+ }
893
+
894
+ function setPreview(b64, name, mode) {
895
+ mediaState = {b64, name: name || 'file', mode: mode || currentMode};
896
+ renderPreview();
897
+ }
898
+ window.__setPreview = setPreview;
899
+
900
+ function clearPreview() {
901
+ mediaState = null;
902
+ renderPreview();
903
+ }
904
+ window.__clearPreview = clearPreview;
905
+
906
+ function processFile(file) {
907
+ if (!file) return;
908
+ if (currentMode === 'image' && !file.type.startsWith('image/')) {
909
+ showToast('Only image files are supported in Image mode', 'error');
910
+ return;
911
+ }
912
+ if (currentMode === 'video' && !file.type.startsWith('video/')) {
913
+ showToast('Only video files are supported in Video mode', 'error');
914
+ return;
915
+ }
916
+ const reader = new FileReader();
917
+ reader.onload = (e) => setPreview(e.target.result, file.name, currentMode);
918
+ reader.readAsDataURL(file);
919
+ }
920
+
921
+ fileInput.addEventListener('change', (e) => {
922
+ const file = e.target.files && e.target.files[0] ? e.target.files[0] : null;
923
+ if (file) processFile(file);
924
+ e.target.value = '';
925
+ });
926
+
927
+ function updateAccept() {
928
+ fileInput.accept = currentMode === 'video' ? 'video/*' : 'image/*';
929
+ const main = document.getElementById('upload-main-text');
930
+ const sub = document.getElementById('upload-sub-text');
931
+ if (main) main.textContent = currentMode === 'video' ? 'Click or drag a video here' : 'Click or drag an image here';
932
+ if (sub) sub.textContent = currentMode === 'video'
933
+ ? 'Upload one short video clip for document-aware video understanding'
934
+ : 'Upload one document, page, screenshot, receipt, or scene image for OCR and reasoning';
935
+ if (!mediaState && mediaStatus) mediaStatus.textContent = `No ${currentMode} uploaded`;
936
+ }
937
+
938
+ if (uploadClick) uploadClick.addEventListener('click', () => fileInput.click());
939
+ if (btnUpload) btnUpload.addEventListener('click', () => fileInput.click());
940
+ if (btnClear) btnClear.addEventListener('click', clearPreview);
941
+
942
+ dropZone.addEventListener('dragover', (e) => {
943
+ e.preventDefault();
944
+ dropZone.classList.add('drag-over');
945
+ });
946
+ dropZone.addEventListener('dragleave', (e) => {
947
+ e.preventDefault();
948
+ dropZone.classList.remove('drag-over');
949
+ });
950
+ dropZone.addEventListener('drop', (e) => {
951
+ e.preventDefault();
952
+ dropZone.classList.remove('drag-over');
953
+ if (e.dataTransfer.files && e.dataTransfer.files.length) processFile(e.dataTransfer.files[0]);
954
+ });
955
+
956
+ promptInput.addEventListener('input', syncPromptToGradio);
957
+
958
+ function activateModelTab(name) {
959
+ document.querySelectorAll('.model-tab[data-model]').forEach(btn => {
960
+ btn.classList.toggle('active', btn.getAttribute('data-model') === name);
961
+ });
962
+ syncModelToGradio(name);
963
+ }
964
+ window.__activateModelTab = activateModelTab;
965
+
966
+ function activateModeTab(mode) {
967
+ currentMode = mode;
968
+ document.querySelectorAll('.mode-tab[data-mode]').forEach(btn => {
969
+ btn.classList.toggle('active', btn.getAttribute('data-mode') === mode);
970
+ });
971
+ syncModeToGradio(mode);
972
+ updateAccept();
973
+ if (mediaState && mediaState.mode !== mode) clearPreview();
974
+ }
975
+ window.__activateModeTab = activateModeTab;
976
+
977
+ document.querySelectorAll('.model-tab[data-model]').forEach(btn => {
978
+ btn.addEventListener('click', () => activateModelTab(btn.getAttribute('data-model')));
979
+ });
980
+ document.querySelectorAll('.mode-tab[data-mode]').forEach(btn => {
981
+ btn.addEventListener('click', () => activateModeTab(btn.getAttribute('data-mode')));
982
+ });
983
+
984
+ activateModelTab('Cosmos-Reason1-7B');
985
+ activateModeTab('image');
986
+
987
+ function syncSlider(customId, gradioId) {
988
+ const slider = document.getElementById(customId);
989
+ const valSpan = document.getElementById(customId + '-val');
990
+ if (!slider) return;
991
+ slider.addEventListener('input', () => {
992
+ if (valSpan) valSpan.textContent = slider.value;
993
+ const container = document.getElementById(gradioId);
994
+ if (!container) return;
995
+ container.querySelectorAll('input[type="range"],input[type="number"]').forEach(el => {
996
+ const ns = Object.getOwnPropertyDescriptor(HTMLInputElement.prototype, 'value');
997
+ if (ns && ns.set) {
998
+ ns.set.call(el, slider.value);
999
+ el.dispatchEvent(new Event('input', {bubbles:true, composed:true}));
1000
+ el.dispatchEvent(new Event('change', {bubbles:true, composed:true}));
1001
+ }
1002
+ });
1003
+ });
1004
+ }
1005
+
1006
+ syncSlider('custom-max-new-tokens', 'gradio-max-new-tokens');
1007
+ syncSlider('custom-temperature', 'gradio-temperature');
1008
+ syncSlider('custom-top-p', 'gradio-top-p');
1009
+ syncSlider('custom-top-k', 'gradio-top-k');
1010
+ syncSlider('custom-repetition-penalty', 'gradio-repetition-penalty');
1011
+ syncSlider('custom-gpu-duration', 'gradio-gpu-duration');
1012
+
1013
+ function validateBeforeRun() {
1014
+ const promptVal = promptInput.value.trim();
1015
+ if (!mediaState && !promptVal) {
1016
+ showToast(`Please upload a ${currentMode} and enter your instruction`, 'error');
1017
+ flashPromptError();
1018
+ return false;
1019
+ }
1020
+ if (!mediaState) {
1021
+ showToast(`Please upload a ${currentMode}`, 'error');
1022
+ return false;
1023
+ }
1024
+ if (mediaState.mode !== currentMode) {
1025
+ showToast(`Uploaded media does not match ${currentMode} mode`, 'error');
1026
+ return false;
1027
+ }
1028
+ if (!promptVal) {
1029
+ showToast('Please enter your instruction', 'warning');
1030
+ flashPromptError();
1031
+ return false;
1032
+ }
1033
+ const currentModel = (document.querySelector('.model-tab.active') || {}).dataset?.model;
1034
+ if (!currentModel) {
1035
+ showToast('Please select a model', 'error');
1036
+ return false;
1037
+ }
1038
+ return true;
1039
+ }
1040
+
1041
+ window.__clickGradioRunBtn = function() {
1042
+ if (!validateBeforeRun()) return;
1043
+ syncPromptToGradio();
1044
+ syncMediaToGradio();
1045
+ const activeModel = document.querySelector('.model-tab.active');
1046
+ if (activeModel) syncModelToGradio(activeModel.getAttribute('data-model'));
1047
+ const activeMode = document.querySelector('.mode-tab.active');
1048
+ if (activeMode) syncModeToGradio(activeMode.getAttribute('data-mode'));
1049
+ if (outputArea) outputArea.value = '';
1050
+ showLoader();
1051
+ setTimeout(() => {
1052
+ const gradioBtn = document.getElementById('gradio-run-btn');
1053
+ if (!gradioBtn) return;
1054
+ const btn = gradioBtn.querySelector('button');
1055
+ if (btn) btn.click(); else gradioBtn.click();
1056
+ }, 180);
1057
+ };
1058
+
1059
+ if (runBtnEl) runBtnEl.addEventListener('click', () => window.__clickGradioRunBtn());
1060
+
1061
+ const copyBtn = document.getElementById('copy-output-btn');
1062
+ if (copyBtn) {
1063
+ copyBtn.addEventListener('click', async () => {
1064
+ try {
1065
+ const text = outputArea ? outputArea.value : '';
1066
+ if (!text.trim()) {
1067
+ showToast('No output to copy', 'warning');
1068
+ flashOutputError();
1069
+ return;
1070
+ }
1071
+ await navigator.clipboard.writeText(text);
1072
+ showToast('Output copied to clipboard', 'info');
1073
+ } catch(e) {
1074
+ showToast('Copy failed', 'error');
1075
+ }
1076
+ });
1077
+ }
1078
+
1079
+ const saveBtn = document.getElementById('save-output-btn');
1080
+ if (saveBtn) {
1081
+ saveBtn.addEventListener('click', () => {
1082
+ const text = outputArea ? outputArea.value : '';
1083
+ if (!text.trim()) {
1084
+ showToast('No output to save', 'warning');
1085
+ flashOutputError();
1086
+ return;
1087
+ }
1088
+ const blob = new Blob([text], {type: 'text/plain;charset=utf-8'});
1089
+ const a = document.createElement('a');
1090
+ a.href = URL.createObjectURL(blob);
1091
+ a.download = 'docscope_r1_output.txt';
1092
+ document.body.appendChild(a);
1093
+ a.click();
1094
+ setTimeout(() => {
1095
+ URL.revokeObjectURL(a.href);
1096
+ document.body.removeChild(a);
1097
+ }, 200);
1098
+ showToast('Output saved', 'info');
1099
+ });
1100
+ }
1101
+
1102
+ document.querySelectorAll('.example-card[data-idx]').forEach(card => {
1103
+ card.addEventListener('click', () => {
1104
+ const idx = card.getAttribute('data-idx');
1105
+ document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
1106
+ card.classList.add('loading');
1107
+ showToast('Loading example...', 'info');
1108
+ setGradioValue('example-result-data', '');
1109
+ setGradioValue('example-idx-input', idx);
1110
+ setTimeout(() => {
1111
+ const btn = document.getElementById('example-load-btn');
1112
+ if (btn) {
1113
+ const b = btn.querySelector('button');
1114
+ if (b) b.click(); else btn.click();
1115
+ }
1116
+ }, 150);
1117
+ setTimeout(() => card.classList.remove('loading'), 12000);
1118
+ });
1119
+ });
1120
+
1121
+ function checkExampleResult() {
1122
+ if (!exampleResultContainer) return;
1123
+ const el = exampleResultContainer.querySelector('textarea') || exampleResultContainer.querySelector('input');
1124
+ if (!el || !el.value) return;
1125
+ if (window.__lastExampleVal === el.value) return;
1126
+ try {
1127
+ const data = JSON.parse(el.value);
1128
+ if (data.status === 'ok') {
1129
+ window.__lastExampleVal = el.value;
1130
+ if (data.mode) activateModeTab(data.mode);
1131
+ if (data.media) setPreview(data.media, data.name || 'example', data.mode || 'image');
1132
+ if (data.query) {
1133
+ promptInput.value = data.query;
1134
+ syncPromptToGradio();
1135
+ }
1136
+ if (data.model) activateModelTab(data.model);
1137
+ document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
1138
+ showToast('Example loaded', 'info');
1139
+ } else if (data.status === 'error') {
1140
+ document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
1141
+ showToast(data.message || 'Failed to load example', 'error');
1142
+ }
1143
+ } catch(e) {}
1144
+ }
1145
+
1146
+ const obsExample = new MutationObserver(checkExampleResult);
1147
+ if (exampleResultContainer) {
1148
+ obsExample.observe(exampleResultContainer, {childList:true, subtree:true, characterData:true, attributes:true});
1149
+ }
1150
+ setInterval(checkExampleResult, 500);
1151
+
1152
+ if (outputArea) outputArea.value = '';
1153
+ const sb = document.getElementById('sb-run-state');
1154
+ if (sb) sb.textContent = 'Ready';
1155
+ if (mediaStatus) mediaStatus.textContent = 'No image uploaded';
1156
+ }
1157
+ init();
1158
+ }
1159
+ """
1160
+
1161
# JavaScript injected on app load: continuously mirrors the hidden Gradio
# result textbox (#gradio-result) into the custom read-only output textarea,
# auto-scrolls it, and hides the loader overlay once streaming text arrives.
# (Reconstructed from diff-mangled source: '+' prefixes and interleaved line
# numbers removed; string content preserved verbatim.)
wire_outputs_js = r"""
() => {
    function watchOutputs() {
        const resultContainer = document.getElementById('gradio-result');
        const outArea = document.getElementById('custom-output-textarea');
        if (!resultContainer || !outArea) { setTimeout(watchOutputs, 500); return; }

        let lastText = '';

        function syncOutput() {
            const el = resultContainer.querySelector('textarea') || resultContainer.querySelector('input');
            if (!el) return;
            const val = el.value || '';
            if (val !== lastText) {
                lastText = val;
                outArea.value = val;
                outArea.scrollTop = outArea.scrollHeight;
                if (window.__hideLoader && val.trim()) window.__hideLoader();
            }
        }

        // MutationObserver catches DOM-driven updates; the interval is a
        // fallback for value changes that do not mutate the DOM tree.
        const observer = new MutationObserver(syncOutput);
        observer.observe(resultContainer, {childList:true, subtree:true, characterData:true, attributes:true});
        setInterval(syncOutput, 500);
    }
    watchOutputs();
}
"""
1189
+
1190
# Inline SVG: white document glyph rendered inside the app header logo.
# (Reconstructed from diff-mangled source; SVG markup preserved verbatim.)
DOC_LOGO_SVG = """
<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg">
    <path d="M7 3h7l5 5v11a2 2 0 0 1-2 2H7a2 2 0 0 1-2-2V5a2 2 0 0 1 2-2Zm7 1.5V9h4.5" fill="none" stroke="white" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"/>
    <path d="M9 12h6M9 15h6M9 18h4" fill="none" stroke="white" stroke-width="1.8" stroke-linecap="round"/>
</svg>
"""
1196
+
1197
# Inline SVG: dashed-frame image placeholder shown in the empty upload zone.
# (Reconstructed from diff-mangled source; SVG markup preserved verbatim.)
UPLOAD_PREVIEW_SVG = """
<svg viewBox="0 0 80 80" fill="none" xmlns="http://www.w3.org/2000/svg">
    <rect x="8" y="14" width="64" height="52" rx="6" fill="none" stroke="#FF1493" stroke-width="2" stroke-dasharray="4 3"/>
    <polygon points="12,62 30,40 42,50 54,34 68,62" fill="rgba(255,20,147,0.15)" stroke="#FF1493" stroke-width="1.5"/>
    <circle cx="28" cy="30" r="6" fill="rgba(255,20,147,0.2)" stroke="#FF1493" stroke-width="1.5"/>
</svg>
"""
1204
+
1205
+ COPY_SVG = """<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M16 1H4C2.9 1 2 1.9 2 3v12h2V3h12V1zm3 4H8C6.9 5 6 5.9 6 7v14c0 1.1.9 2 2 2h11c1.1 0 2-.9 2-2V7c0-1.1-.9-2-2-2zm0 16H8V7h11v14z"/></svg>"""
1206
+ SAVE_SVG = """<svg viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M17 3H5a2 2 0 0 0-2 2v14a2 2 0 0 0 2 2h14a2 2 0 0 0 2-2V7l-4-4zM7 5h8v4H7V5zm12 14H5v-6h14v6z"/></svg>"""
1207
+
1208
# One header tab button per available model; "Cosmos-Reason1-7B" starts
# selected (class "active"). The data-model attribute is read by the page JS
# to sync the selection back into the hidden Gradio model textbox.
# (Reconstructed from diff-mangled source.)
MODEL_TABS_HTML = "".join([
    f'<button class="model-tab{" active" if m == "Cosmos-Reason1-7B" else ""}" data-model="{m}"><span class="model-tab-label">{m}</span></button>'
    for m in MODEL_CHOICES
])
1212
+
1213
# Static tab bar for switching between image and video inference; the
# data-mode attribute is read by the page JS to sync the hidden mode textbox.
# (Reconstructed from diff-mangled source; markup preserved verbatim.)
MODE_TABS_HTML = """
<button class="mode-tab active" data-mode="image">Image Inference</button>
<button class="mode-tab" data-mode="video">Video Inference</button>
"""
1217
+
1218
# Build the Gradio app. The visible UI is a single gr.HTML shell; the real
# Gradio components below are hidden (elem_classes="hidden-input") and act as
# a bridge: page JavaScript writes user state (mode, model, prompt, base64
# media, sampling sliders) into them, then programmatically clicks the hidden
# run button. (Reconstructed from diff-mangled source: '+' prefixes and
# interleaved line numbers removed; element ids and runtime strings preserved.)
with gr.Blocks() as demo:
    # Hidden state mirrors of the custom UI controls.
    hidden_mode_name = gr.Textbox(value="image", elem_id="hidden-mode-name", elem_classes="hidden-input", container=False)
    hidden_image_b64 = gr.Textbox(value="", elem_id="hidden-image-b64", elem_classes="hidden-input", container=False)
    hidden_video_b64 = gr.Textbox(value="", elem_id="hidden-video-b64", elem_classes="hidden-input", container=False)
    prompt = gr.Textbox(value="", elem_id="prompt-gradio-input", elem_classes="hidden-input", container=False)
    hidden_model_name = gr.Textbox(value="Cosmos-Reason1-7B", elem_id="hidden-model-name", elem_classes="hidden-input", container=False)

    # Hidden sampling controls, synced from the custom range sliders.
    max_new_tokens = gr.Slider(minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS, elem_id="gradio-max-new-tokens", elem_classes="hidden-input", container=False)
    temperature = gr.Slider(minimum=0.1, maximum=4.0, step=0.1, value=0.6, elem_id="gradio-temperature", elem_classes="hidden-input", container=False)
    top_p = gr.Slider(minimum=0.05, maximum=1.0, step=0.05, value=0.9, elem_id="gradio-top-p", elem_classes="hidden-input", container=False)
    top_k = gr.Slider(minimum=1, maximum=1000, step=1, value=50, elem_id="gradio-top-k", elem_classes="hidden-input", container=False)
    repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, step=0.05, value=1.2, elem_id="gradio-repetition-penalty", elem_classes="hidden-input", container=False)
    gpu_duration_state = gr.Number(value=60, elem_id="gradio-gpu-duration", elem_classes="hidden-input", container=False)

    # Hidden output textbox; wire_outputs_js mirrors it into the custom panel.
    result = gr.Textbox(value="", elem_id="gradio-result", elem_classes="hidden-input", container=False)

    # Example-loading bridge: JS sets the index, clicks the hidden button,
    # and polls the JSON result textbox.
    example_idx = gr.Textbox(value="", elem_id="example-idx-input", elem_classes="hidden-input", container=False)
    example_result = gr.Textbox(value="", elem_id="example-result-data", elem_classes="hidden-input", container=False)
    example_load_btn = gr.Button("Load Example", elem_id="example-load-btn")

    gr.HTML(f"""
    <div class="app-shell">
        <div class="app-header">
            <div class="app-header-left">
                <div class="app-logo">{DOC_LOGO_SVG}</div>
                <span class="app-title">DocScope R1</span>
                <span class="app-badge">vision enabled</span>
                <span class="app-badge fast">OCR + Reasoning</span>
            </div>
        </div>

        <div class="model-tabs-bar">
            {MODEL_TABS_HTML}
        </div>

        <div class="mode-tabs-bar">
            {MODE_TABS_HTML}
        </div>

        <div class="app-main-row">
            <div class="app-main-left">
                <div id="media-drop-zone">
                    <div id="upload-prompt" class="upload-prompt-modern">
                        <div id="upload-click-area" class="upload-click-area">
                            {UPLOAD_PREVIEW_SVG}
                            <span id="upload-main-text" class="upload-main-text">Click or drag an image here</span>
                            <span id="upload-sub-text" class="upload-sub-text">Upload one document, page, screenshot, receipt, or scene image for OCR and reasoning</span>
                        </div>
                    </div>

                    <input id="custom-file-input" type="file" accept="image/*" style="display:none;" />

                    <div id="single-preview-wrap" class="single-preview-wrap">
                        <div class="single-preview-card">
                            <img id="single-preview-img" src="" alt="Preview" style="display:none;">
                            <video id="single-preview-video" controls playsinline style="display:none;"></video>
                            <div class="preview-overlay-actions">
                                <button id="preview-upload-btn" class="preview-action-btn" title="Replace">Upload</button>
                                <button id="preview-clear-btn" class="preview-action-btn" title="Clear">Clear</button>
                            </div>
                        </div>
                    </div>
                </div>

                <div class="hint-bar">
                    <b>Upload:</b> Click or drag media into the panel &nbsp;&middot;&nbsp;
                    <b>Mode:</b> Switch between image and video inference &nbsp;&middot;&nbsp;
                    <b>Model:</b> Change models from the header &nbsp;&middot;&nbsp;
                    <kbd>Clear</kbd> removes the current media
                </div>

                <div class="examples-section">
                    <div class="examples-title">Quick Examples</div>
                    <div class="examples-scroll">
                        {EXAMPLE_CARDS_HTML}
                    </div>
                </div>
            </div>

            <div class="app-main-right">
                <div class="panel-card">
                    <div class="panel-card-title">Vision / OCR Instruction</div>
                    <div class="panel-card-body">
                        <label class="modern-label" for="custom-query-input">Query Input</label>
                        <textarea id="custom-query-input" class="modern-textarea" rows="4" placeholder="e.g., perform OCR on this image, describe the document, explain the ad, summarize the video, identify visible text, analyze the scene..."></textarea>
                    </div>
                </div>

                <div style="padding:12px 20px;">
                    <button id="custom-run-btn" class="btn-run">
                        <span id="run-btn-label">Run Inference</span>
                    </button>
                </div>

                <div class="output-frame">
                    <div class="out-title">
                        <span id="output-title-label">Raw Output Stream</span>
                        <div class="out-title-right">
                            <button id="copy-output-btn" class="out-action-btn" title="Copy">{COPY_SVG} Copy</button>
                            <button id="save-output-btn" class="out-action-btn" title="Save">{SAVE_SVG} Save File</button>
                        </div>
                    </div>
                    <div class="out-body">
                        <div class="modern-loader" id="output-loader">
                            <div class="loader-spinner"></div>
                            <div class="loader-text">Running inference...</div>
                            <div class="loader-bar-track"><div class="loader-bar-fill"></div></div>
                        </div>
                        <div class="output-scroll-wrap">
                            <textarea id="custom-output-textarea" class="output-textarea" placeholder="Raw output will appear here..." readonly></textarea>
                        </div>
                    </div>
                </div>

                <div class="settings-group">
                    <div class="settings-group-title">Advanced Settings</div>
                    <div class="settings-group-body">
                        <div class="slider-row">
                            <label>Max new tokens</label>
                            <input type="range" id="custom-max-new-tokens" min="1" max="{MAX_MAX_NEW_TOKENS}" step="1" value="{DEFAULT_MAX_NEW_TOKENS}">
                            <span class="slider-val" id="custom-max-new-tokens-val">{DEFAULT_MAX_NEW_TOKENS}</span>
                        </div>
                        <div class="slider-row">
                            <label>Temperature</label>
                            <input type="range" id="custom-temperature" min="0.1" max="4.0" step="0.1" value="0.6">
                            <span class="slider-val" id="custom-temperature-val">0.6</span>
                        </div>
                        <div class="slider-row">
                            <label>Top-p</label>
                            <input type="range" id="custom-top-p" min="0.05" max="1.0" step="0.05" value="0.9">
                            <span class="slider-val" id="custom-top-p-val">0.9</span>
                        </div>
                        <div class="slider-row">
                            <label>Top-k</label>
                            <input type="range" id="custom-top-k" min="1" max="1000" step="1" value="50">
                            <span class="slider-val" id="custom-top-k-val">50</span>
                        </div>
                        <div class="slider-row">
                            <label>Repetition penalty</label>
                            <input type="range" id="custom-repetition-penalty" min="1.0" max="2.0" step="0.05" value="1.2">
                            <span class="slider-val" id="custom-repetition-penalty-val">1.2</span>
                        </div>
                        <div class="slider-row">
                            <label>GPU Duration (seconds)</label>
                            <input type="range" id="custom-gpu-duration" min="60" max="300" step="30" value="60">
                            <span class="slider-val" id="custom-gpu-duration-val">60</span>
                        </div>
                    </div>
                </div>
            </div>
        </div>

        <div class="exp-note">
            Experimental document vision suite &middot; Open on <a href="https://github.com/PRITHIVSAKTHIUR/DocScope-R1" target="_blank">GitHub</a>
        </div>

        <div class="app-statusbar">
            <div class="sb-section" id="sb-media-status">No image uploaded</div>
            <div class="sb-section sb-fixed" id="sb-run-state">Ready</div>
        </div>
    </div>
    """)

    # Hidden run button; the custom UI triggers it via JS.
    run_btn = gr.Button("Run", elem_id="gradio-run-btn")

    # Inject the front-end wiring scripts once the page has loaded.
    demo.load(fn=noop, inputs=None, outputs=None, js=gallery_js)
    demo.load(fn=noop, inputs=None, outputs=None, js=wire_outputs_js)

    # The js pre-hook re-reads the live DOM state (active tabs, prompt text,
    # base64 media) just before invoking run_inference, so stale hidden-input
    # values cannot race the click.
    run_btn.click(
        fn=run_inference,
        inputs=[
            hidden_mode_name,
            hidden_model_name,
            prompt,
            hidden_image_b64,
            hidden_video_b64,
            max_new_tokens,
            temperature,
            top_p,
            top_k,
            repetition_penalty,
            gpu_duration_state,
        ],
        outputs=[result],
        js=r"""(mode, model, p, img, vid, mnt, t, tp, tk, rp, gd) => {
            const modelEl = document.querySelector('.model-tab.active');
            const modeEl = document.querySelector('.mode-tab.active');
            const modelVal = modelEl ? modelEl.getAttribute('data-model') : model;
            const modeVal = modeEl ? modeEl.getAttribute('data-mode') : mode;
            const promptEl = document.getElementById('custom-query-input');
            const promptVal = promptEl ? promptEl.value : p;

            let imgVal = img;
            let vidVal = vid;

            const imgContainer = document.getElementById('hidden-image-b64');
            const vidContainer = document.getElementById('hidden-video-b64');

            if (imgContainer) {
                const inner = imgContainer.querySelector('textarea, input');
                if (inner) imgVal = inner.value;
            }
            if (vidContainer) {
                const inner = vidContainer.querySelector('textarea, input');
                if (inner) vidVal = inner.value;
            }

            return [modeVal, modelVal, promptVal, imgVal, vidVal, mnt, t, tp, tk, rp, gd];
        }""",
    )

    # queue=False: example loading is lightweight and should not wait behind
    # GPU inference jobs.
    example_load_btn.click(
        fn=load_example_data,
        inputs=[example_idx],
        outputs=[example_result],
        queue=False,
    )
1435
 
1436
if __name__ == "__main__":
    # Queue bounds concurrent requests waiting for the GPU worker. css is
    # passed at launch because the UI is rendered via a single gr.HTML shell
    # rather than themed Gradio components; allowed_paths exposes the bundled
    # example media directories to the front end.
    # (Reconstructed from diff-mangled source.)
    demo.queue(max_size=30).launch(
        css=css,
        mcp_server=True,
        ssr_mode=False,
        show_error=True,
        allowed_paths=["images", "videos"],
    )