userisuser Cursor committed on
Commit
cae3a38
·
1 Parent(s): 57ab14f

Load both MiniCPM-V 4.6 variants at startup

Browse files

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (3) hide show
  1. README.md +1 -0
  2. app.py +14 -4
  3. v46/app.py +39 -19
README.md CHANGED
@@ -9,6 +9,7 @@ python_version: "3.12"
9
  app_file: app.py
10
  models:
11
  - openbmb/MiniCPM-V-4.6
 
12
  pinned: false
13
  short_description: MiniCPM-V 4.6 Ultra-Efficient Multimodal AI
14
  ---
 
9
  app_file: app.py
10
  models:
11
  - openbmb/MiniCPM-V-4.6
12
+ - openbmb/MiniCPM-V-4.6-Thinking
13
  pinned: false
14
  short_description: MiniCPM-V 4.6 Ultra-Efficient Multimodal AI
15
  ---
app.py CHANGED
@@ -3,15 +3,25 @@ import os
3
  import spaces
4
  from v46 import app as v46_app
5
 
6
- MODEL_ID = os.environ.get("V46_MODEL_ID", "openbmb/MiniCPM-V-4.6")
 
7
  DEVICE = os.environ.get("V46_DEVICE", "cuda")
8
  DEFAULT_THINKING = os.environ.get("V46_DEFAULT_THINKING", "0") == "1"
9
  GPU_DURATION = int(os.environ.get("V46_GPU_DURATION", "300"))
10
 
11
- print(f"[official-space] lazy model config: {MODEL_ID} on {DEVICE}", flush=True)
12
- v46_app.configure_lazy_models(instruct_path=MODEL_ID, device=DEVICE)
 
 
 
 
 
 
 
 
13
 
14
- # ZeroGPU exposes CUDA only while decorated callbacks are running.
 
15
  v46_app.native_chat_respond = spaces.GPU(duration=GPU_DURATION)(v46_app.native_chat_respond)
16
  v46_app.native_fewshot_respond = spaces.GPU(duration=GPU_DURATION)(v46_app.native_fewshot_respond)
17
 
 
3
  import spaces
4
  from v46 import app as v46_app
5
 
6
+ INSTRUCT_MODEL_ID = os.environ.get("V46_INSTRUCT_MODEL_ID", "openbmb/MiniCPM-V-4.6")
7
+ THINKING_MODEL_ID = os.environ.get("V46_THINKING_MODEL_ID", "openbmb/MiniCPM-V-4.6-Thinking")
8
  DEVICE = os.environ.get("V46_DEVICE", "cuda")
9
  DEFAULT_THINKING = os.environ.get("V46_DEFAULT_THINKING", "0") == "1"
10
  GPU_DURATION = int(os.environ.get("V46_GPU_DURATION", "300"))
11
 
12
+ print(
13
+ f"[official-space] loading models at module startup: "
14
+ f"instruct={INSTRUCT_MODEL_ID}, thinking={THINKING_MODEL_ID}, device={DEVICE}",
15
+ flush=True,
16
+ )
17
+ v46_app.load_models(
18
+ instruct_path=INSTRUCT_MODEL_ID,
19
+ thinking_path=THINKING_MODEL_ID,
20
+ device=DEVICE,
21
+ )
22
 
23
+ # ZeroGPU docs recommend placing models on cuda at module level and
24
+ # decorating GPU-dependent callbacks.
25
  v46_app.native_chat_respond = spaces.GPU(duration=GPU_DURATION)(v46_app.native_chat_respond)
26
  v46_app.native_fewshot_respond = spaces.GPU(duration=GPU_DURATION)(v46_app.native_fewshot_respond)
27
 
v46/app.py CHANGED
@@ -1099,7 +1099,6 @@ def on_thinking_toggle(thinking_mode, chat_bot, app_session):
1099
  return gr.update(), gr.update(), app_session, \
1100
  gr.update(), gr.update(), gr.update()
1101
 
1102
- gr.Info(f"Switched to '{new_variant}' model, history cleared.")
1103
  app_session["ctx"] = []
1104
  app_session["images_cnt"] = 0
1105
  app_session["videos_cnt"] = 0
@@ -1302,6 +1301,20 @@ def native_remove_last_turn(chat_messages, app_cfg):
1302
  return last_turn, chat_messages, app_cfg
1303
 
1304
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1305
  def native_chat_respond(user_input, chat_messages, app_cfg,
1306
  params_form, thinking_mode, streaming_mode,
1307
  max_new_tokens, temperature, top_p, top_k, max_frames,
@@ -1320,23 +1333,24 @@ def native_chat_respond(user_input, chat_messages, app_cfg,
1320
  yield gr.update(), chat_messages, app_cfg, gr.update(visible=False)
1321
  return
1322
 
 
 
 
 
 
 
 
 
1323
  chat_messages = list(chat_messages or [])
1324
  display_start = len(chat_messages)
1325
  chat_messages.extend(native_display_user_messages(text, files))
1326
  assistant_index = len(chat_messages)
1327
- chat_messages.append({"role": "assistant", "content": "⏳ Processing…"})
1328
  yield native_empty_input(), chat_messages, app_cfg, gr.update(visible=True)
1329
 
1330
  ctx = app_cfg.get("ctx", [])
1331
  messages = [{"role": item["role"], "content": copy.copy(item["content"])} for item in ctx]
1332
  messages.append({"role": "user", "content": user_content})
1333
- sampling = (params_form == "Sampling")
1334
- if not sampling:
1335
- streaming_mode = False
1336
- use_thinking = bool(thinking_mode)
1337
- variant = pick_variant(use_thinking)
1338
- enable_thinking = use_thinking and variant == "thinking"
1339
- app_cfg["current_variant"] = variant
1340
  print(f"[native] respond variant={variant} enable_thinking={enable_thinking}", flush=True)
1341
 
1342
  try:
@@ -1464,23 +1478,24 @@ def native_fewshot_respond(_image, _user_message, _chat_messages, _app_cfg,
1464
  yield _image, _user_message, "", _chat_messages, _app_cfg, gr.update(visible=False)
1465
  return
1466
 
 
 
 
 
 
 
 
 
1467
  _chat_messages = list(_chat_messages or [])
1468
  display_start = len(_chat_messages)
1469
  _chat_messages.extend(native_display_user_messages(_user_message or "", files))
1470
  assistant_index = len(_chat_messages)
1471
- _chat_messages.append({"role": "assistant", "content": "⏳ Processing…"})
1472
  yield None, "", "", _chat_messages, _app_cfg, gr.update(visible=True)
1473
 
1474
  ctx = list(_app_cfg.get("ctx", []))
1475
  messages = [{"role": item["role"], "content": copy.copy(item["content"])} for item in ctx]
1476
  messages.append({"role": "user", "content": user_content})
1477
- sampling = (params_form == "Sampling")
1478
- if not sampling:
1479
- streaming_mode = False
1480
- use_thinking = bool(thinking_mode)
1481
- variant = pick_variant(use_thinking)
1482
- enable_thinking = use_thinking and variant == "thinking"
1483
- _app_cfg["current_variant"] = variant
1484
  print(f"[native] fewshot variant={variant} enable_thinking={enable_thinking}", flush=True)
1485
 
1486
  try:
@@ -1600,8 +1615,6 @@ def native_clear_all(txt_message, chat_messages, app_session):
1600
 
1601
  def native_on_thinking_toggle(thinking_mode, chat_messages, app_session):
1602
  target_variant = pick_variant(bool(thinking_mode))
1603
- if target_variant != app_session.get("current_variant"):
1604
- gr.Info(f"Switched to '{target_variant}' model, history cleared.")
1605
  app_session["current_variant"] = target_variant
1606
  return native_clear_all(None, chat_messages, app_session)
1607
 
@@ -1778,6 +1791,7 @@ def build_ui(model_display_name: str, default_thinking: bool):
1778
  params_form, thinking_mode, streaming_mode,
1779
  max_new_tokens, temperature, top_p, top_k, max_frames],
1780
  [txt_message, chat_bot, app_session, stop_btn],
 
1781
  )
1782
 
1783
  with gr.Tab("Few Shot") as fewshot_tab:
@@ -1809,6 +1823,7 @@ def build_ui(model_display_name: str, default_thinking: bool):
1809
  chat_bot, app_session],
1810
  [image_input, user_message, assistant_message,
1811
  chat_bot, app_session],
 
1812
  )
1813
  generate_btn.click(
1814
  native_fewshot_respond,
@@ -1817,6 +1832,7 @@ def build_ui(model_display_name: str, default_thinking: bool):
1817
  max_new_tokens, temperature, top_p, top_k, max_frames],
1818
  [image_input, user_message, assistant_message,
1819
  chat_bot, app_session, stop_btn],
 
1820
  )
1821
 
1822
  # Tab switch events: remember current tab + clear state
@@ -1853,6 +1869,7 @@ def build_ui(model_display_name: str, default_thinking: bool):
1853
  inputs=[thinking_mode, chat_bot, app_session],
1854
  outputs=[txt_message, chat_bot, app_session,
1855
  image_input, user_message, assistant_message],
 
1856
  )
1857
  regenerate_btn.click(
1858
  native_regenerate_clicked,
@@ -1860,17 +1877,20 @@ def build_ui(model_display_name: str, default_thinking: bool):
1860
  params_form, thinking_mode, streaming_mode,
1861
  max_new_tokens, temperature, top_p, top_k, max_frames],
1862
  [txt_message, chat_bot, app_session, stop_btn],
 
1863
  )
1864
  clear_btn.click(
1865
  native_clear_all,
1866
  [txt_message, chat_bot, app_session],
1867
  [txt_message, chat_bot, app_session,
1868
  image_input, user_message, assistant_message],
 
1869
  )
1870
  stop_btn.click(
1871
  stop_clicked,
1872
  [app_session],
1873
  [app_session, stop_btn],
 
1874
  )
1875
 
1876
  with gr.Tab("How to use"):
 
1099
  return gr.update(), gr.update(), app_session, \
1100
  gr.update(), gr.update(), gr.update()
1101
 
 
1102
  app_session["ctx"] = []
1103
  app_session["images_cnt"] = 0
1104
  app_session["videos_cnt"] = 0
 
1301
  return last_turn, chat_messages, app_cfg
1302
 
1303
 
1304
+ def model_call_status_message(variant: str) -> str:
1305
+ if variant in MODELS:
1306
+ return "⏳ Processing…"
1307
+ if variant == "thinking":
1308
+ return (
1309
+ "⏳ Loading the Thinking model. "
1310
+ "Please wait…"
1311
+ )
1312
+ return (
1313
+ "⏳ Loading the model. "
1314
+ "Please wait…"
1315
+ )
1316
+
1317
+
1318
  def native_chat_respond(user_input, chat_messages, app_cfg,
1319
  params_form, thinking_mode, streaming_mode,
1320
  max_new_tokens, temperature, top_p, top_k, max_frames,
 
1333
  yield gr.update(), chat_messages, app_cfg, gr.update(visible=False)
1334
  return
1335
 
1336
+ sampling = (params_form == "Sampling")
1337
+ if not sampling:
1338
+ streaming_mode = False
1339
+ use_thinking = bool(thinking_mode)
1340
+ variant = pick_variant(use_thinking)
1341
+ enable_thinking = use_thinking and variant == "thinking"
1342
+ app_cfg["current_variant"] = variant
1343
+
1344
  chat_messages = list(chat_messages or [])
1345
  display_start = len(chat_messages)
1346
  chat_messages.extend(native_display_user_messages(text, files))
1347
  assistant_index = len(chat_messages)
1348
+ chat_messages.append({"role": "assistant", "content": model_call_status_message(variant)})
1349
  yield native_empty_input(), chat_messages, app_cfg, gr.update(visible=True)
1350
 
1351
  ctx = app_cfg.get("ctx", [])
1352
  messages = [{"role": item["role"], "content": copy.copy(item["content"])} for item in ctx]
1353
  messages.append({"role": "user", "content": user_content})
 
 
 
 
 
 
 
1354
  print(f"[native] respond variant={variant} enable_thinking={enable_thinking}", flush=True)
1355
 
1356
  try:
 
1478
  yield _image, _user_message, "", _chat_messages, _app_cfg, gr.update(visible=False)
1479
  return
1480
 
1481
+ sampling = (params_form == "Sampling")
1482
+ if not sampling:
1483
+ streaming_mode = False
1484
+ use_thinking = bool(thinking_mode)
1485
+ variant = pick_variant(use_thinking)
1486
+ enable_thinking = use_thinking and variant == "thinking"
1487
+ _app_cfg["current_variant"] = variant
1488
+
1489
  _chat_messages = list(_chat_messages or [])
1490
  display_start = len(_chat_messages)
1491
  _chat_messages.extend(native_display_user_messages(_user_message or "", files))
1492
  assistant_index = len(_chat_messages)
1493
+ _chat_messages.append({"role": "assistant", "content": model_call_status_message(variant)})
1494
  yield None, "", "", _chat_messages, _app_cfg, gr.update(visible=True)
1495
 
1496
  ctx = list(_app_cfg.get("ctx", []))
1497
  messages = [{"role": item["role"], "content": copy.copy(item["content"])} for item in ctx]
1498
  messages.append({"role": "user", "content": user_content})
 
 
 
 
 
 
 
1499
  print(f"[native] fewshot variant={variant} enable_thinking={enable_thinking}", flush=True)
1500
 
1501
  try:
 
1615
 
1616
  def native_on_thinking_toggle(thinking_mode, chat_messages, app_session):
1617
  target_variant = pick_variant(bool(thinking_mode))
 
 
1618
  app_session["current_variant"] = target_variant
1619
  return native_clear_all(None, chat_messages, app_session)
1620
 
 
1791
  params_form, thinking_mode, streaming_mode,
1792
  max_new_tokens, temperature, top_p, top_k, max_frames],
1793
  [txt_message, chat_bot, app_session, stop_btn],
1794
+ show_progress="hidden",
1795
  )
1796
 
1797
  with gr.Tab("Few Shot") as fewshot_tab:
 
1823
  chat_bot, app_session],
1824
  [image_input, user_message, assistant_message,
1825
  chat_bot, app_session],
1826
+ show_progress="hidden",
1827
  )
1828
  generate_btn.click(
1829
  native_fewshot_respond,
 
1832
  max_new_tokens, temperature, top_p, top_k, max_frames],
1833
  [image_input, user_message, assistant_message,
1834
  chat_bot, app_session, stop_btn],
1835
+ show_progress="hidden",
1836
  )
1837
 
1838
  # Tab switch events: remember current tab + clear state
 
1869
  inputs=[thinking_mode, chat_bot, app_session],
1870
  outputs=[txt_message, chat_bot, app_session,
1871
  image_input, user_message, assistant_message],
1872
+ show_progress="hidden",
1873
  )
1874
  regenerate_btn.click(
1875
  native_regenerate_clicked,
 
1877
  params_form, thinking_mode, streaming_mode,
1878
  max_new_tokens, temperature, top_p, top_k, max_frames],
1879
  [txt_message, chat_bot, app_session, stop_btn],
1880
+ show_progress="hidden",
1881
  )
1882
  clear_btn.click(
1883
  native_clear_all,
1884
  [txt_message, chat_bot, app_session],
1885
  [txt_message, chat_bot, app_session,
1886
  image_input, user_message, assistant_message],
1887
+ show_progress="hidden",
1888
  )
1889
  stop_btn.click(
1890
  stop_clicked,
1891
  [app_session],
1892
  [app_session, stop_btn],
1893
+ show_progress="hidden",
1894
  )
1895
 
1896
  with gr.Tab("How to use"):