qic999 committed on
Commit 5a11a50 · verified · 1 Parent(s): a18823a

Upload folder using huggingface_hub
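
For reference, a minimal sketch of the kind of call that produces a commit like this one with the huggingface_hub Python client; the `repo_id` and local folder path below are illustrative placeholders, not values recorded on this page:

```python
from huggingface_hub import HfApi

api = HfApi()  # authenticates with a saved login token or the HF_TOKEN env var

# Push a local directory to the Hub as a single commit. Binary files that
# match the LFS rules in .gitattributes (extended in the diff below) are
# uploaded through Git LFS rather than stored inline.
api.upload_folder(
    folder_path="./local_checkout",   # placeholder: the local folder to upload
    repo_id="user/repo",              # placeholder: the target Hub repository
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```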

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +494 -0
  2. UMM/BLIP3o-Qwen3-Siglip2/README.md +31 -0
  3. UMM/BLIP3o-Qwen3-Siglip2/blip3o/.DS_Store +0 -0
  4. UMM/BLIP3o-Qwen3-Siglip2/blip3o/__init__.py +0 -0
  5. UMM/BLIP3o-Qwen3-Siglip2/blip3o/__pycache__/__init__.cpython-311.pyc +0 -0
  6. UMM/BLIP3o-Qwen3-Siglip2/blip3o/__pycache__/constants.cpython-311.pyc +0 -0
  7. UMM/BLIP3o-Qwen3-Siglip2/blip3o/__pycache__/conversation.cpython-311.pyc +0 -0
  8. UMM/BLIP3o-Qwen3-Siglip2/blip3o/__pycache__/mm_utils.cpython-311.pyc +0 -0
  9. UMM/BLIP3o-Qwen3-Siglip2/blip3o/__pycache__/utils.cpython-311.pyc +0 -0
  10. UMM/BLIP3o-Qwen3-Siglip2/blip3o/constants.py +26 -0
  11. UMM/BLIP3o-Qwen3-Siglip2/blip3o/conversation.py +476 -0
  12. UMM/BLIP3o-Qwen3-Siglip2/blip3o/mm_utils.py +247 -0
  13. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/__init__.py +4 -0
  14. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/__pycache__/__init__.cpython-311.pyc +0 -0
  15. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/__pycache__/blip3o_arch.cpython-311.pyc +0 -0
  16. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/__pycache__/builder.cpython-311.pyc +0 -0
  17. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/__pycache__/llava_arch.cpython-311.pyc +0 -0
  18. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/__pycache__/lumina_nextdit2d.cpython-311.pyc +0 -0
  19. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/__pycache__/nextdit_crossattn.cpython-311.pyc +0 -0
  20. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/apply_delta.py +48 -0
  21. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/blip3o_arch.py +371 -0
  22. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/builder.py +103 -0
  23. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/consolidate.py +25 -0
  24. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/language_model/__pycache__/blip3o_llama.cpython-311.pyc +0 -0
  25. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/language_model/__pycache__/blip3o_qwen.cpython-311.pyc +0 -0
  26. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/language_model/__pycache__/blip3o_qwen_inference.cpython-311.pyc +0 -0
  27. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/language_model/__pycache__/llava_llama.cpython-311.pyc +0 -0
  28. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/language_model/__pycache__/llava_qwen.cpython-311.pyc +0 -0
  29. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/language_model/blip3o_qwen.py +421 -0
  30. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/language_model/blip3o_qwen_inference.py +418 -0
  31. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/lumina_nextdit2d.py +365 -0
  32. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/make_delta.py +48 -0
  33. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/__pycache__/builder.cpython-311.pyc +0 -0
  34. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/__pycache__/clip_encoder.cpython-311.pyc +0 -0
  35. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/__pycache__/imagebind.cpython-311.pyc +0 -0
  36. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/__pycache__/open_clip_encoder.cpython-311.pyc +0 -0
  37. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/__pycache__/siglip_encoder.cpython-311.pyc +0 -0
  38. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/builder.py +55 -0
  39. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/clip_encoder.py +172 -0
  40. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/__pycache__/eva_vit.cpython-311.pyc +0 -0
  41. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__init__.py +9 -0
  42. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/__init__.cpython-311.pyc +0 -0
  43. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/constants.cpython-311.pyc +0 -0
  44. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/eva_vit_model.cpython-311.pyc +0 -0
  45. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/factory.cpython-311.pyc +0 -0
  46. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/hf_configs.cpython-311.pyc +0 -0
  47. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/hf_model.cpython-311.pyc +0 -0
  48. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/loss.cpython-311.pyc +0 -0
  49. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/model.cpython-311.pyc +0 -0
  50. UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/modified_resnet.cpython-311.pyc +0 -0
.gitattributes CHANGED
@@ -7408,3 +7408,497 @@ not_work/GenCT-ageencoder-casualatte-frozentext2-multimlp/logs/full_ct_2d_with_b
 not_work/GenCT-ageencoder-casualatte-frozentext2-multimlp/logs/full_ct_2d_with_body_mask/inference/BDMAP_00009151_sample_8.png filter=lfs diff=lfs merge=lfs -text
 not_work/GenCT-ageencoder-casualatte-frozentext2-multimlp/logs/full_ct_2d_with_body_mask/inference/BDMAP_00009151_sample_9.png filter=lfs diff=lfs merge=lfs -text
 not_work/GenCT-ageencoder-casualatte-frozentext2-multimlp/misc/overview.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/BLIP3o-Qwen3-Siglip2/eval/lmms-eval/lmms_eval/tasks/mmmu/arial.ttf filter=lfs diff=lfs merge=lfs -text
+ UMM/BLIP3o-Qwen3-Siglip2/eval/lmms-eval/tools/live_bench/live_bench/data_generator/example/example_website.png filter=lfs diff=lfs merge=lfs -text
+ UMM/BLIP3o-Qwen3-Siglip2/figure/arch.png filter=lfs diff=lfs merge=lfs -text
+ UMM/BLIP3o-Qwen3-Siglip2/figure/image.png filter=lfs diff=lfs merge=lfs -text
+ UMM/BLIP3o-Qwen3-Siglip2/figure/wechat_1.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/BLIP3o-Qwen3-Siglip2/figure/wechat_2.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/BLIP3o-Qwen3-Siglip2/gradio/animal-compare.png filter=lfs diff=lfs merge=lfs -text
+ UMM/BLIP3o-main/eval/lmms-eval/lmms_eval/tasks/mmmu/arial.ttf filter=lfs diff=lfs merge=lfs -text
+ UMM/BLIP3o-main/eval/lmms-eval/tools/live_bench/live_bench/data_generator/example/example_website.png filter=lfs diff=lfs merge=lfs -text
+ UMM/BLIP3o-main/figure/arch.png filter=lfs diff=lfs merge=lfs -text
+ UMM/BLIP3o-main/figure/image.png filter=lfs diff=lfs merge=lfs -text
+ UMM/BLIP3o-main/figure/wechat_2.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/BLIP3o-main/gradio/animal-compare.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/assets/arch.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/assets/emerging_curves.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/assets/teaser.webp filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/inference.ipynb filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/000000013318.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/000000069500.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/000000126457.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/000000128172.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/000000162435.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/000000269996.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/000000409897.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/000000429873.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/00548dfc8ec76f5d.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/008447.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/008548.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/010706.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/016721.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/024486.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/027170.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/030168.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/032672906be2e4c9.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/033215.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/036195.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/038257.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/038468.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/043701.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/051169.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/052849.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/057680.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/060165.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/062009.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/062571.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/063b239d-0ce7-4fdd-8984-17a823e92db8.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/064115.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/065501.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/071939.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/071969.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/072068.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/076318.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/076910.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/077021.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/083379.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/083752.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/086628.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/089255.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/090048.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/092505.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/094888.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/096894.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/099141.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/0cd50fe4a0c2b85b.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/1-1277350-3.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/1-15838081-3.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/1-2082992-5.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/1-25926120-7.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/1-27435931-1.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/1-29087004-3.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/10011.jpeg.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/10751.jpeg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/10820.jpeg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/11534.jpeg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/121350e0df26f3b6.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/146c68bec53d6555.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/18310.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/18361a25529ab455.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/19745.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/1e4f9f282cb0cd13.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-10166535-6.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-10220225-2.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-10809142-10.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-10834877-1.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-1132593-1.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-11783766-1.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-11960713-3.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-12292738-1.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-13193466-16.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-14415913-1.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-15198842-1.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-15230272-3.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-15524351-19.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-16452451-2.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-16514275-1.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-16678064-7.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-16678076-2.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-18662026-1.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/2018%2F04%2F12%2F5-things-im-looking-forward-to-at-the-cc-global-summit.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/2d48a140b40ab164.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/30800.jpeg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/30962.jpeg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/31351.jpeg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/32e250aa3b3ac720.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/35699.jpeg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/35853.jpeg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/368cd4700649c9cbd86d216dae633759_page0.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/394661ab74481234.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/39952.jpeg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/3d951179-22dd-424a-8710-c37c23cdc53f.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/3eb46fdf7959dde6.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/3ff51c29c8072953.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/4166.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/4276c80c3b7930c9.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/47118cff0c0fc78e.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/4a605f94-9c59-4f9b-b065-8149fd85ab04.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/4d7df8bc913ce40d.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/5b6b4973fa4dab6f.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/6753f457c6586afb.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/6961db28b3fb0fa6.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/6986ef13-a72d-4f1d-a56c-5aad081fc99f.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/7cf8aae04fe5d666.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/89f64c747cf016ec.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/8bb96c2c-44ae-44d5-a9b0-bd1dae6f606e.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/9062612221d6858f.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/944610512a34aa8c.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/98c23700d1fa72a2501413686351895d_page0.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/9957.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/9cac76ee-185e-4f5d-b37a-f3dee758b6ad.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/Q-Instruct-DB_spaq_koniq_5001866078.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/__browse__family-and-whanau__getting-married__civil-unions.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/__story__i-help-people-live-until-very-last-moment.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/a04da161a9471f14.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/a16af064214aa8cd.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/afcf8b37411dc885.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/allava_laion_inst_102149.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/allava_laion_inst_103811.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/allava_laion_inst_11000.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/allava_laion_inst_110113.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/allava_laion_inst_114688.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/allava_laion_inst_129672.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/allava_laion_inst_140217.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/allava_laion_inst_14538.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/allava_laion_inst_145456.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/allava_laion_inst_151486.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/allava_vflan_inst_17341.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/allava_vflan_inst_4060.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/aokvqa_00000078.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/aokvqa_00001895.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/aokvqa_00002763.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/aokvqa_00003400.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/aokvqa_00003581.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/aokvqa_00005417.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/aokvqa_00005631.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/aokvqa_00007073.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/aokvqa_00015635.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/aokvqa_00016326.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/aokvqa_00016496.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/articles%2F10.3389%2Ffrym.2019.00143.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/b7611a502ccb5e78.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/c57e7036951a271d.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/c7635ad85e7d41de.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/cf2bb1fe-8d74-4661-b47c-a0374aa6ada7.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/clevr_00000282.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/clevr_00000673.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/clevr_CLEVR_v1.0_images_train_CLEVR_train_001004.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/clevr_CLEVR_v1.0_images_train_CLEVR_train_002089.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/clevr_CLEVR_v1.0_images_train_CLEVR_train_003884.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/clevr_CLEVR_v1.0_images_train_CLEVR_train_025922.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/clevr_CLEVR_v1.0_images_train_CLEVR_train_044970.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/clevr_CLEVR_v1.0_images_train_CLEVR_train_058337.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/clevr_CLEVR_v1.0_images_train_CLEVR_train_061217.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/coco_train2017_000000014990.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/csv_202-csv_31.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/csv_203-csv_346.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/csv_203-csv_687.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/csv_204-csv_967.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/d5c18ad80e53a3a2.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/d625b376a03157b9.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/d9d8a559-8e36-aaaa-2260-828171d65d66_page0.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/design2code_images_12.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/e3223bf8a2246fd7.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/e46fd80ca8cffc2f6adcf05ea82fe717_page0.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/e8ee95010a24c276.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/eaa4fab593b3f51c.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/ed15edb180c89ed2.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/en__Mersing.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/en__Northeastern_India.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/f7e353339bb0138b.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/fd6def9b-5147-4f65-bf4b-5bb33280cfad.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/fhng0227_20.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/geomverse_00000695.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/geomverse_00003159.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/geomverse_00004202.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/geomverse_00008626.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/gnvj0223_14.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/gqa_images_2331056.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/hateful_memes_00000121.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/hateful_memes_00000946.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/hateful_memes_00001415.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/hateful_memes_00004103.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/hateful_memes_00004395.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/hateful_memes_00005301.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/hateful_memes_00006102.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/hlvj0223_5.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/hqcb0079_1.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/human_system_digestive_3671.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/human_system_muscular_6162.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/identity_100949.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/identity_1102.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/identity_121996.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/identity_178393.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/identity_228572.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/identity_233489.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/identity_295545.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/image_textualization_000028931.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/image_textualization_000048745.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/image_textualization_000057758.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/image_textualization_000068046.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/infographic_vqa_00001045.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/infographic_vqa_00001842.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/information-sheets__schools__screening-pirated-dvds.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/kzng0227_13.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/kzng0227_30.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/lhjh0227_3.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/mapqa_00014030.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/mapqa_00015080.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/mapqa_00018919.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/mapqa_00019597.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/mapqa_00020998.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/mapqa_00021653.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/mapqa_00022334.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/mapqa_00022961.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/nldg0227_16.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/pkbd0227_2.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/pzmg0227_1.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/rendered_text_00000322.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/rendered_text_00002477.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/rendered_text_00006606.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/rendered_text_00006827.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/rendered_text_00008123.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_sqa_00000688.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_sqa_00001761.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_sqa_00004235.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_sqa_00005254.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_sqa_00006488.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wikisql_00017148.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wikisql_00022345.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wikisql_00025558.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wikisql_00028965.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wikisql_00039057.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wikisql_00045126.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wikisql_00046516.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wikisql_00051301.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wikisql_00052683.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wikisql_00057151.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wikisql_00064796.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wikisql_00067312.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wikisql_00074000.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wtq_00005518.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wtq_00014606.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wtq_00016168.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wtq_00018484.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wtq_00018651.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wtq_00025796.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wtq_00026608.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wtq_00027454.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wtq_00035291.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wtq_00036709.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wtq_00037407.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/rtxm0227_11.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/sa_14051.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/sa_14818.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/sa_16513.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/sa_16542.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/screen2words_00002430.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/screen2words_00003580.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/screen2words_00003808.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/screen2words_00005136.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/screen2words_00008652.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/screen2words_00011020.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/screen2words_00015281.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/sharegpt4o_13358.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/sharegpt4o_18904.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/sharegpt4o_26696.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/sharegpt4o_30415.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/sharegpt4o_36245.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/sharegpt4o_54791.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/st_vqa_00006958.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/st_vqa_00007820.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/st_vqa_00011034.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/st_vqa_00012481.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/st_vqa_00012552.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/st_vqa_00012968.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/st_vqa_00014977.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/st_vqa_00015303.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/stbm0227_16.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/tallyqa_00000636.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/tallyqa_00010487.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/tallyqa_VG_100K_2322803.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/tallyqa_VG_100K_2_2386823.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/vision-flan_CLEVR_CoGenT+Multiple_Question_Answering+CLEVR_trainA_062588.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/vision-flan_CLEVR_CoGenT+Question_Answer_Matching+CLEVR_trainA_003809.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/vision-flan_CLEVR_CoGenT+VQA_context+CLEVR_trainA_029052.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/vision-flan_Clevr+VQA_context+CLEVR_train_021395.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/vision-flan_Clevr+VQA_context+CLEVR_train_050800.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/vision-flan_Dark-Zurich+time_classification+GOPR0345_frame_000691_rgb_anon.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/vision-flan_FGVC_Aircraft+Aircraft_Classification_Family+2235142.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/vision-flan_MEMOTION+sentiment_detection+image_329.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/vision-flan_VQG+question_generation+710487a5-43a2-4f69-9c5b-3b21ac207395_0.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/vision-flan_WIKIART+art_classification+ivan-aivazovsky_dusk-on-the-golden-horn-1845.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/vision-flan_infographicvqa+single_document_question+30989.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/vision-flan_semart+image_school+1683.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/vision-flan_textcaps+caption_generation+20971.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/visual7w_00004696.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/visual7w_00008404.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/visual7w_00010373.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/visual7w_00013878.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/vsr_00001436.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/websight_00001692.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/websight_00002626.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/wiki__At_least_30_dead_after_bus_crashes_off_cliff_and_into_river_in_South_Africa.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/wiki__Chile_elects_first_woman_President.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/xggn0226_15.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Bagel-Med/train_data/bagel_example/vlm/images/xrcy0227_80.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/agent_function_call/mobile_en_example.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/agent_function_call/mobile_zh_example.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/computer_use/computer_use1.jpeg filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/computer_use/computer_use2.jpeg filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/document_parsing/docparsing_example1.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/document_parsing/docparsing_example2.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/document_parsing/docparsing_example3.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/document_parsing/docparsing_example4.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/document_parsing/docparsing_example5.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/document_parsing/docparsing_example6.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/document_parsing/docparsing_example7.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/document_parsing/docparsing_example8.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/multimodal_coding/screenshot_demo.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/multimodal_coding/sketch2code_input.jpeg filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/ocr/.ipynb_checkpoints/ocr_example1-checkpoint.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/ocr/.ipynb_checkpoints/ocr_example2-checkpoint.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/ocr/.ipynb_checkpoints/ocr_example3-checkpoint.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/ocr/ocr_example1.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/ocr/ocr_example3.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/ocr/ocr_example5.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/omni_recognition/sample-anime-result.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/omni_recognition/sample-anime.jpeg filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/omni_recognition/sample-bird-result.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/omni_recognition/sample-bird.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/omni_recognition/sample-celebrity-result.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/omni_recognition/sample-celebrity.jpeg filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/omni_recognition/sample-food-result.jpeg filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/omni_recognition/sample-food.jpeg filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/qwenagent/hopinn.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/spatial_understanding/autonomous_driving.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/spatial_understanding/dining_table.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/spatial_understanding/drone_cars2.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/spatial_understanding/football_field.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/spatial_understanding/lots_of_cars.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/spatial_understanding/lots_of_people.jpeg filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/spatial_understanding/office.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/spatial_understanding/spatio_case2_aff.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/spatial_understanding/spatio_case2_aff2.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/spatial_understanding/spatio_case2_plan.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/cookbooks/assets/spatial_understanding/spatio_case2_plan2.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/qwen-vl-finetune/demo/images/COCO_train2014_000000580957.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/qwen-vl-finetune/demo/videos/v_7bUu05RIksU.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/qwen-vl-finetune/demo/videos/v_TpB_zMG3XBA.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/Qwen3-VL/qwen-vl-finetune/demo/videos/v_rBMQFpHspmo.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/docs/characteristic_comparison.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/docs/demo1.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/docs/demo2.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/docs/demo3.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/docs/github_extrapolation.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/docs/github_inpainting.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/docs/github_mmu.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/docs/github_t2i.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/docs/overview.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/docs/show-o-512x512-mmu.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/docs/show-o-512x512-t2i.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/docs/show-o-geneval.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/docs/show-o-want-u.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/docs/showo.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/docs/videos/i2v_1.gif filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/docs/videos/i2v_2.gif filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/docs/videos/i2v_3.gif filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/docs/videos/i2v_4.gif filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/docs/videos/sky.gif filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/docs/videos/waves.gif filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/docs/wechat_qa_3.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/inpainting_validation/alpine_lake.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/inpainting_validation/bench.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/inpainting_validation/bus.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/inpainting_validation/maya.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/inpainting_validation/river.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/inpainting_validation/train.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/inpainting_validation/truebsee.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/inpainting_validation/wukong1.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/mmu_validation/sofa_under_water.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/show-o2/docs/comparative_analysis.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/show-o2/docs/demo1.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/show-o2/docs/demo2.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/show-o2/docs/demo3.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/show-o2/docs/mmu/hanjingcenter.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/show-o2/docs/mmu/pexels-fotios-photos-2923436.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/show-o2/docs/mmu/pexels-jane-pham-727419-1571673.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/show-o2/docs/mmu/pexels-mccutcheon-1148998.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/show-o2/docs/mmu/pexels-muffin-1558665.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/show-o2/docs/mmu/pexels-pixabay-207983.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/show-o2/docs/mmu/pexels-psco-1071882.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/show-o2/docs/mmu/pexels-talha-ahmed-26040377-7949588.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/show-o2/docs/mmu/pexels-taryn-elliott-4144459.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/show-o2/docs/mmu/pexels-thelazyartist-1117485.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/show-o2/docs/overview.png filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/show-o2/docs/videos/i2v_1.gif filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/show-o2/docs/videos/i2v_2.gif filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/show-o2/docs/videos/i2v_3.gif filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/show-o2/docs/videos/i2v_4.gif filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/show-o2/docs/videos/sky.gif filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/show-o2/docs/videos/video1.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/show-o2/docs/videos/video2.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/show-o2/docs/videos/waves.gif filter=lfs diff=lfs merge=lfs -text
+ UMM/Show-o/training/questions.json filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/assets/image.png filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/assets/teaser.gif filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/assets/teaser.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/i2v/1.png filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/i2v/output.gif filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/i2v/output.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/in_context_image_edit/input.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/in_context_video_edit/id_addition/output.gif filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/in_context_video_edit/id_addition/output.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/in_context_video_edit/id_addition/reference.gif filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/in_context_video_edit/id_addition/reference.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/in_context_video_edit/id_swap/ID.jpeg filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/in_context_video_edit/id_swap/origin.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/in_context_video_edit/id_swap/output.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/in_context_video_edit/style/output.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/in_context_video_edit/style/ref.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/in_context_video_edit/style/video.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/in_context_video_gen/1.png filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/in_context_video_gen/2.png filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/in_context_video_gen/3.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/in_context_video_gen/output.gif filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/in_context_video_gen/output.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/t2i/output.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/t2v/output.gif filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/t2v/output.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/understanding/1.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/understanding/input.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/video_edit/output.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/video_edit/style/output.gif filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/video_edit/style/output.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/video_edit/video.gif filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo/demo/video_edit/video.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/assets/image.png filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/assets/teaser.gif filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/assets/teaser.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/i2v/1.png filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/i2v/output.gif filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/i2v/output.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/in_context_image_edit/input.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/in_context_video_edit/id_addition/output.gif filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/in_context_video_edit/id_addition/output.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/in_context_video_edit/id_addition/reference.gif filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/in_context_video_edit/id_addition/reference.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/in_context_video_edit/id_swap/ID.jpeg filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/in_context_video_edit/id_swap/origin.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/in_context_video_edit/id_swap/output.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/in_context_video_edit/style/output.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/in_context_video_edit/style/ref.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/in_context_video_edit/style/video.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/in_context_video_gen/1.png filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/in_context_video_gen/2.png filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/in_context_video_gen/3.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/in_context_video_gen/output.gif filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/in_context_video_gen/output.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/t2i/output.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/t2v/output.gif filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/t2v/output.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/understanding/1.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/understanding/input.jpg filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/video_edit/output.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/video_edit/style/output.gif filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/video_edit/style/output.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/video_edit/video.gif filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/demo/video_edit/video.mp4 filter=lfs diff=lfs merge=lfs -text
+ UMM/UniVideo-Med/sharegpt4video_40k.jsonl filter=lfs diff=lfs merge=lfs -text
+ UMM/unsloth/images/Where_Terminal.png filter=lfs diff=lfs merge=lfs -text
+ UMM/unsloth/images/unsloth[[:space:]]end.png filter=lfs diff=lfs merge=lfs -text
+ UMM/unsloth/images/unsloth[[:space:]]loading[[:space:]]page[[:space:]]render.png filter=lfs diff=lfs merge=lfs -text
+ UMM/unsloth/images/unsloth[[:space:]]logo[[:space:]]black[[:space:]]text.png filter=lfs diff=lfs merge=lfs -text
+ UMM/unsloth/images/unsloth[[:space:]]logo[[:space:]]white[[:space:]]text.png filter=lfs diff=lfs merge=lfs -text
+ UMM/unsloth/images/unsloth[[:space:]]sticker.png filter=lfs diff=lfs merge=lfs -text
UMM/BLIP3o-Qwen3-Siglip2/README.md ADDED
@@ -0,0 +1,31 @@
+
+ ## Qwen3 + SigLIP2 + EVACLIP
+
+ This branch combines:
+ - **Qwen3** as the autoregressive backbone (any Qwen3 size works: 0.6B, 1.7B, 4B, 8B, 14B, or 32B)
+ - **SigLIP2** as the vision encoder for image understanding
+ - **EVACLIP** as the vision encoder for image generation
+
+ You can set up and run this branch in the same environment as the `main` branch.
+
+ ### Available Training Modes
+ - **Image Understanding (I2T)**
+ - **Image Generation (T2I)**
+ - **Joint Training** (both tasks)
+
+ To choose a training task, update the dataloader in `train.py`:
+ - Image generation data [https://github.com/JiuhaiChen/BLIP3o/blob/Qwen3-Siglip2/blip3o/train/train.py#L498]
+ - Image understanding data [https://github.com/JiuhaiChen/BLIP3o/blob/Qwen3-Siglip2/blip3o/train/train.py#L512]
+
+ Specific data-type markers in the script:
+ - **T2I** (Text-to-Image) [https://github.com/JiuhaiChen/BLIP3o/blob/Qwen3-Siglip2/blip3o/train/train.py#L503]
+ - **I2T** (Image-to-Text) [https://github.com/JiuhaiChen/BLIP3o/blob/Qwen3-Siglip2/blip3o/train/train.py#L517]
+
+ ### Freezing the Backbone
+ - Add `--freeze_backbone True` to the training script to freeze Qwen3 during training
+ - Add `--freeze_backbone False` to unfreeze Qwen3 (we recommend unfreezing the backbone when training image understanding tasks)
+
+ ### Adjust your batch size according to your GPU setup!
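As a rough illustration of the `--freeze_backbone` flag described in the README above (this is a hypothetical sketch, not the repository's actual `train.py` code), freezing the backbone amounts to disabling gradients on the Qwen3 trunk while keeping the generation-side modules trainable. The attribute names `mm_projector`, `dit`, and `latent_queries` follow `blip3o_arch.py` later in this commit; the helper name is invented for illustration:

```python
def freeze_qwen3_backbone(model):
    # Hypothetical helper: disable gradients on the whole inner model (Qwen3 trunk)...
    model.get_model().requires_grad_(False)
    # ...then re-enable the parts that should keep training (names as in blip3o_arch.py).
    model.get_model().mm_projector.requires_grad_(True)
    model.get_model().dit.requires_grad_(True)
    model.get_model().latent_queries.requires_grad = True
```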
UMM/BLIP3o-Qwen3-Siglip2/blip3o/.DS_Store ADDED
Binary file (6.15 kB).
UMM/BLIP3o-Qwen3-Siglip2/blip3o/__init__.py ADDED
File without changes
UMM/BLIP3o-Qwen3-Siglip2/blip3o/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (156 Bytes).
UMM/BLIP3o-Qwen3-Siglip2/blip3o/__pycache__/constants.cpython-311.pyc ADDED
Binary file (669 Bytes).
UMM/BLIP3o-Qwen3-Siglip2/blip3o/__pycache__/conversation.cpython-311.pyc ADDED
Binary file (19.6 kB).
UMM/BLIP3o-Qwen3-Siglip2/blip3o/__pycache__/mm_utils.cpython-311.pyc ADDED
Binary file (14.6 kB).
UMM/BLIP3o-Qwen3-Siglip2/blip3o/__pycache__/utils.cpython-311.pyc ADDED
Binary file (6.94 kB).
UMM/BLIP3o-Qwen3-Siglip2/blip3o/constants.py ADDED
@@ -0,0 +1,26 @@
+ CONTROLLER_HEART_BEAT_EXPIRATION = 30
+ WORKER_HEART_BEAT_INTERVAL = 15
+
+ LOGDIR = "."
+
+ # Model Constants
+ IGNORE_INDEX = -100
+ # IMAGE_TOKEN_INDEX = -200
+
+ DEFAULT_IMAGE_TOKEN = "<image>"
+ DEFAULT_IM_START_TOKEN = "[IMG]"
+ DEFAULT_IM_END_TOKEN = "[/IMG]"
+
+
+ # IMAGE_TOKEN_IDX = 32002
+ # DEFAULT_IM_START_TOKEN_IDX = 32000
+ # DEFAULT_IM_END_TOKEN_IDX = 32001
+
+ IMAGE_TOKEN_IDX = 151655
+ DEFAULT_IM_START_TOKEN_IDX = 151669
+ DEFAULT_IM_END_TOKEN_IDX = 151670
+ UND_IMAGE_TOKEN_IDX = 151671
+ # N_QUERY = 729
+
+ DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
+ IMAGE_PLACEHOLDER = "<image-placeholder>"
UMM/BLIP3o-Qwen3-Siglip2/blip3o/conversation.py ADDED
@@ -0,0 +1,476 @@
+ import dataclasses
+ from enum import auto, Enum
+ from typing import Any, List, Tuple
+ import base64
+ from io import BytesIO
+ from PIL import Image
+
+
+ class SeparatorStyle(Enum):
+     """Different separator style."""
+     SINGLE = auto()
+     TWO = auto()
+     MPT = auto()
+     PLAIN = auto()
+     LLAMA_2 = auto()
+     CHATML = auto()
+     QWEN = auto()
+     LLAMA_3 = auto()  # referenced by get_prompt below
+     GEMMA = auto()  # referenced by get_prompt below
+
+
+ @dataclasses.dataclass
+ class Conversation:
+     """A class that keeps all conversation history."""
+     system: str
+     roles: List[str]
+     messages: List[List[str]]
+     offset: int
+     sep_style: SeparatorStyle = SeparatorStyle.SINGLE
+     sep: str = "###"
+     sep2: str = None
+     version: str = "Unknown"
+     tokenizer: Any = None  # needed by the LLAMA_3 branch of get_prompt
+
+     skip_next: bool = False
+
+     def get_prompt(self):
+         messages = self.messages
+         if len(messages) > 0 and type(messages[0][1]) is tuple:
+             messages = self.messages.copy()
+             init_role, init_msg = messages[0].copy()
+             init_msg = init_msg[0]
+             if "mmtag" in self.version:
+                 init_msg = init_msg.replace("<image>", "").strip()
+                 messages[0] = (init_role, init_msg)
+                 messages.insert(0, (self.roles[0], "<Image><image></Image>"))
+                 messages.insert(1, (self.roles[1], "Received."))
+             elif not init_msg.startswith("<image>"):
+                 init_msg = init_msg.replace("<image>", "").strip()
+                 messages[0] = (init_role, "<image>\n" + init_msg)
+             else:
+                 messages[0] = (init_role, init_msg)
+
+         if self.sep_style == SeparatorStyle.SINGLE:
+             ret = self.system + self.sep
+             for role, message in messages:
+                 if message:
+                     if type(message) is tuple:
+                         message, _, _ = message
+                     ret += role + ": " + message + self.sep
+                 else:
+                     ret += role + ":"
+
+         elif self.sep_style == SeparatorStyle.TWO:
+             seps = [self.sep, self.sep2]
+             ret = self.system + seps[0]
+             for i, (role, message) in enumerate(messages):
+                 if message:
+                     if type(message) is tuple:
+                         message, _, _ = message
+                     ret += role + ": " + message + seps[i % 2]
+                 else:
+                     ret += role + ":"
+
+         elif self.sep_style == SeparatorStyle.CHATML:
+             ret = "" if self.system == "" else self.system + self.sep + "\n"
+             for role, message in messages:
+                 if message:
+                     if type(message) is tuple:
+                         message, images, _ = message
+                         message = "<image>" * len(images) + message
+                     ret += role + "\n" + message + self.sep + "\n"
+                 else:
+                     ret += role + "\n"
+             return ret
+
+         elif self.sep_style == SeparatorStyle.LLAMA_3:
+             if self.tokenizer is None:
+                 raise ValueError("Llama 3 tokenizer is not available. Make sure you have the necessary permissions.")
+             chat_template_messages = [{"role": "system", "content": self.system}]
+             for role, message in messages:
+                 if message:
+                     if type(message) is tuple:
+                         message, images = message
+                         message = "<image>" * len(images) + message
+                     chat_template_messages.append({"role": role, "content": message})
+
+             # print(chat_template_messages)
+             return self.tokenizer.apply_chat_template(chat_template_messages, tokenize=False, add_generation_prompt=True)
+             # ret = "" if self.system == "" else self.system + self.sep + "\n"
+             # for role, message in messages:
+             #     if message:
+             #         if type(message) is tuple:
+             #             message, images = message
+             #             message = "<image>" * len(images) + message
+             #         ret += role + "\n" + message + self.sep + "\n"
+             #     else:
+             #         ret += role + "\n"
+             # return ret
+
+         elif self.sep_style == SeparatorStyle.MPT:
+             ret = self.system + self.sep
+             for role, message in messages:
+                 if message:
+                     if type(message) is tuple:
+                         message, _, _ = message
+                     ret += role + message + self.sep
+                 else:
+                     ret += role
+
+         elif self.sep_style == SeparatorStyle.GEMMA:
+             ret = ""
+             for i, (role, message) in enumerate(messages):
+                 assert role == self.roles[i % 2], "Conversation should alternate user/assistant/user/assistant/..."
+                 if message:
+                     if type(message) is tuple:
+                         message, _, _ = message
+                     ret += role + message + self.sep
+                 else:
+                     ret += role
+
+         elif self.sep_style == SeparatorStyle.LLAMA_2:
+             wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n" if len(msg) > 0 else msg
+             wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
+             ret = ""
+
+             for i, (role, message) in enumerate(messages):
+                 if i == 0:
+                     assert message, "first message should not be none"
+                     assert role == self.roles[0], "first message should come from user"
+                 if message:
+                     if type(message) is tuple:
+                         message, _, _ = message
+                     if i == 0:
+                         message = wrap_sys(self.system) + message
+                     if i % 2 == 0:
+                         message = wrap_inst(message)
+                         ret += self.sep + message
+                     else:
+                         ret += " " + message + " " + self.sep2
+                 else:
+                     ret += ""
+             ret = ret.lstrip(self.sep)
+
+         elif self.sep_style == SeparatorStyle.PLAIN:
+             seps = [self.sep, self.sep2]
+             ret = self.system
+             for i, (role, message) in enumerate(messages):
+                 if message:
+                     if type(message) is tuple:
+                         message, _, _ = message
+                     ret += message + seps[i % 2]
+                 else:
+                     ret += ""
+         else:
+             raise ValueError(f"Invalid style: {self.sep_style}")
+
+         return ret
+
+     def append_message(self, role, message):
+         self.messages.append([role, message])
+
+     def process_image(self, image, image_process_mode, return_pil=False, image_format='PNG', max_len=1344, min_len=672):
+         if image_process_mode == "Pad":
+             def expand2square(pil_img, background_color=(122, 116, 104)):
+                 width, height = pil_img.size
+                 if width == height:
+                     return pil_img
+                 elif width > height:
+                     result = Image.new(pil_img.mode, (width, width), background_color)
+                     result.paste(pil_img, (0, (width - height) // 2))
+                     return result
+                 else:
+                     result = Image.new(pil_img.mode, (height, height), background_color)
+                     result.paste(pil_img, ((height - width) // 2, 0))
+                     return result
+
+             image = expand2square(image)
+         elif image_process_mode in ["Default", "Crop"]:
+             pass
+         elif image_process_mode == "Resize":
+             image = image.resize((336, 336))
+         else:
+             raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
+         if max(image.size) > max_len:
+             max_hw, min_hw = max(image.size), min(image.size)
+             aspect_ratio = max_hw / min_hw
+             shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
+             longest_edge = int(shortest_edge * aspect_ratio)
+             W, H = image.size
+             if H > W:
+                 H, W = longest_edge, shortest_edge
+             else:
+                 H, W = shortest_edge, longest_edge
+             image = image.resize((W, H))
+         if return_pil:
+             return image
+         else:
+             buffered = BytesIO()
+             image.save(buffered, format=image_format)
+             img_b64_str = base64.b64encode(buffered.getvalue()).decode()
+             return img_b64_str
+
+     def get_images(self, return_pil=False):
+         images = []
+         for i, (role, msg) in enumerate(self.messages[self.offset:]):
+             if i % 2 == 0:
+                 if type(msg) is tuple:
+                     msg, image, image_process_mode = msg
+                     image = self.process_image(image, image_process_mode, return_pil=return_pil)
+                     images.append(image)
+         return images
+
+     def to_gradio_chatbot(self):
+         ret = []
+         for i, (role, msg) in enumerate(self.messages[self.offset:]):
+             if i % 2 == 0:
+                 if type(msg) is tuple:
+                     msg, image, image_process_mode = msg
+                     img_b64_str = self.process_image(
+                         image, "Default", return_pil=False,
+                         image_format='JPEG')
+                     img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
+                     msg = img_str + msg.replace('<image>', '').strip()
+                     ret.append([msg, None])
+                 else:
+                     ret.append([msg, None])
+             else:
+                 ret[-1][-1] = msg
+         return ret
+
+     def copy(self):
+         return Conversation(
+             system=self.system,
+             roles=self.roles,
+             messages=[[x, y] for x, y in self.messages],
+             offset=self.offset,
+             sep_style=self.sep_style,
+             sep=self.sep,
+             sep2=self.sep2,
+             version=self.version)
+
+     def dict(self):
+         if len(self.get_images()) > 0:
+             return {
+                 "system": self.system,
+                 "roles": self.roles,
+                 "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
+                 "offset": self.offset,
+                 "sep": self.sep,
+                 "sep2": self.sep2,
+             }
+         return {
+             "system": self.system,
+             "roles": self.roles,
+             "messages": self.messages,
+             "offset": self.offset,
+             "sep": self.sep,
+             "sep2": self.sep2,
+         }
+
+
+ conv_vicuna_v0 = Conversation(
+     system="A chat between a curious human and an artificial intelligence assistant. "
+            "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+     roles=("Human", "Assistant"),
+     messages=(
+         ("Human", "What are the key differences between renewable and non-renewable energy sources?"),
+         ("Assistant",
+          "Renewable energy sources are those that can be replenished naturally in a relatively "
+          "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
+          "Non-renewable energy sources, on the other hand, are finite and will eventually be "
+          "depleted, such as coal, oil, and natural gas. Here are some key differences between "
+          "renewable and non-renewable energy sources:\n"
+          "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
+          "energy sources are finite and will eventually run out.\n"
+          "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
+          "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
+          "and other negative effects.\n"
+          "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
+          "have lower operational costs than non-renewable sources.\n"
+          "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
+          "locations than non-renewable sources.\n"
+          "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
+          "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
+          "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
+          "non-renewable sources are not, and their depletion can lead to economic and social instability.\n")
+     ),
+     offset=2,
+     sep_style=SeparatorStyle.SINGLE,
+     sep="###",
+ )
+
+ conv_vicuna_v1 = Conversation(
+     system="A chat between a curious user and an artificial intelligence assistant. "
+            "The assistant gives helpful, detailed, and polite answers to the user's questions.",
+     roles=("USER", "ASSISTANT"),
+     version="v1",
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.TWO,
+     sep=" ",
+     sep2="</s>",
+ )
+
+ conv_llama_2 = Conversation(
+     system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+
+ If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
+     roles=("USER", "ASSISTANT"),
+     version="llama_v2",
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.LLAMA_2,
+     sep="<s>",
+     sep2="</s>",
+ )
+
+
+ conv_blip3o_llama_2 = Conversation(
+     system="You are a helpful language and vision assistant. "
+            "You are able to understand the visual content that the user provides, "
+            "and assist the user with a variety of tasks using natural language.",
+     roles=("USER", "ASSISTANT"),
+     version="llama_v2",
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.LLAMA_2,
+     sep="<s>",
+     sep2="</s>",
+ )
+
+ conv_mpt = Conversation(
+     system="""<|im_start|>system
+ A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
+     roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+     version="mpt",
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.MPT,
+     sep="<|im_end|>",
+ )
+
+ conv_blip3o_plain = Conversation(
+     system="",
+     roles=("", ""),
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.PLAIN,
+     sep="\n",
+ )
+
+ conv_blip3o_v0 = Conversation(
+     system="A chat between a curious human and an artificial intelligence assistant. "
+            "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+     roles=("Human", "Assistant"),
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.SINGLE,
+     sep="###",
+ )
+
+ conv_blip3o_v0_mmtag = Conversation(
+     system="A chat between a curious user and an artificial intelligence assistant. "
+            "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
+            "The visual content will be provided with the following format: <Image>visual content</Image>.",
+     roles=("Human", "Assistant"),
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.SINGLE,
+     sep="###",
+     version="v0_mmtag",
+ )
+
+ conv_blip3o_v1 = Conversation(
+     system="A chat between a curious human and an artificial intelligence assistant. "
+            "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+     roles=("USER", "ASSISTANT"),
+     version="v1",
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.TWO,
+     sep=" ",
+     sep2="</s>",
+ )
+
+ conv_blip3o_v1_mmtag = Conversation(
+     system="A chat between a curious user and an artificial intelligence assistant. "
+            "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
+            "The visual content will be provided with the following format: <Image>visual content</Image>.",
+     roles=("USER", "ASSISTANT"),
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.TWO,
+     sep=" ",
+     sep2="</s>",
+     version="v1_mmtag",
+ )
+
+ conv_mistral_instruct = Conversation(
+     system="",
+     roles=("USER", "ASSISTANT"),
+     version="llama_v2",
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.LLAMA_2,
+     sep="",
+     sep2="</s>",
+ )
+
+ conv_chatml_direct = Conversation(
+     system="""<|im_start|>system
+ Answer the questions.""",
+     roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+     version="mpt",
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.MPT,
+     sep="<|im_end|>",
+ )
+
+ conv_llama3 = Conversation(
+     system="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.""",
+     roles=("<|start_header_id|>user<|end_header_id|>\n\n", "<|start_header_id|>assistant<|end_header_id|>\n\n"),
+     version="llama3",
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.MPT,
+     sep="<|eot_id|>",
+ )
+
+ conv_qwen = Conversation(
+     system="""<|im_start|>system
+ You are a helpful assistant.""",
+     roles=("<|im_start|>user", "<|im_start|>assistant"),
+     version="qwen",
+     messages=[],
+     offset=0,
+     sep_style=SeparatorStyle.CHATML,
+     sep="<|im_end|>",
+ )
+
+
+ default_conversation = conv_llama3
+ conv_templates = {
+     "default": conv_vicuna_v0,
+     "v0": conv_vicuna_v0,
+     "v1": conv_vicuna_v1,
+     "vicuna_v1": conv_vicuna_v1,
+     "llama_2": conv_llama_2,
+     "mistral_instruct": conv_mistral_instruct,
+     "chatml_direct": conv_chatml_direct,
+     "mistral_direct": conv_chatml_direct,
+     "plain": conv_blip3o_plain,
+     "v0_plain": conv_blip3o_plain,
+     "blip3o_v0": conv_blip3o_v0,
+     "v0_mmtag": conv_blip3o_v0_mmtag,
+     "blip3o_v1": conv_blip3o_v1,
+     "v1_mmtag": conv_blip3o_v1_mmtag,
+     "llama3": conv_llama3,
+     "qwen": conv_qwen,
+     "mpt": conv_mpt,
+ }
+
+ if __name__ == "__main__":
+     print(default_conversation.get_prompt())
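For reference, a minimal usage sketch of the templates defined in this file (the prompt text is illustrative). Leaving the assistant message as `None` terminates the prompt with an open assistant turn for generation:

```python
from blip3o.conversation import conv_templates

conv = conv_templates["qwen"].copy()
conv.append_message(conv.roles[0], "Describe this image.\n<image>")
conv.append_message(conv.roles[1], None)  # None leaves the assistant turn open
prompt = conv.get_prompt()  # ChatML-style string ending in "<|im_start|>assistant\n"
```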
UMM/BLIP3o-Qwen3-Siglip2/blip3o/mm_utils.py ADDED
@@ -0,0 +1,247 @@
+ from PIL import Image
+ from io import BytesIO
+ import base64
+ import torch
+ import math
+ import ast
+
+ from transformers import StoppingCriteria
+ from blip3o.constants import IMAGE_TOKEN_IDX
+
+
+ def select_best_resolution(original_size, possible_resolutions):
+     """
+     Selects the best resolution from a list of possible resolutions based on the original size.
+
+     Args:
+         original_size (tuple): The original size of the image in the format (width, height).
+         possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
+
+     Returns:
+         tuple: The best fit resolution in the format (width, height).
+     """
+     original_width, original_height = original_size
+     best_fit = None
+     max_effective_resolution = 0
+     min_wasted_resolution = float('inf')
+
+     for width, height in possible_resolutions:
+         scale = min(width / original_width, height / original_height)
+         downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
+         effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
+         wasted_resolution = (width * height) - effective_resolution
+
+         if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution):
+             max_effective_resolution = effective_resolution
+             min_wasted_resolution = wasted_resolution
+             best_fit = (width, height)
+
+     return best_fit
+
+
+ def resize_and_pad_image(image, target_resolution):
+     """
+     Resize and pad an image to a target resolution while maintaining aspect ratio.
+
+     Args:
+         image (PIL.Image.Image): The input image.
+         target_resolution (tuple): The target resolution (width, height) of the image.
+
+     Returns:
+         PIL.Image.Image: The resized and padded image.
+     """
+     original_width, original_height = image.size
+     target_width, target_height = target_resolution
+
+     scale_w = target_width / original_width
+     scale_h = target_height / original_height
+
+     if scale_w < scale_h:
+         new_width = target_width
+         new_height = min(math.ceil(original_height * scale_w), target_height)
+     else:
+         new_height = target_height
+         new_width = min(math.ceil(original_width * scale_h), target_width)
+
+     # Resize the image
+     resized_image = image.resize((new_width, new_height))
+
+     new_image = Image.new('RGB', (target_width, target_height), (0, 0, 0))
+     paste_x = (target_width - new_width) // 2
+     paste_y = (target_height - new_height) // 2
+     new_image.paste(resized_image, (paste_x, paste_y))
+
+     return new_image
+
+
+ def divide_to_patches(image, patch_size):
+     """
+     Divides an image into patches of a specified size.
+
+     Args:
+         image (PIL.Image.Image): The input image.
+         patch_size (int): The size of each patch.
+
+     Returns:
+         list: A list of PIL.Image.Image objects representing the patches.
+     """
+     patches = []
+     width, height = image.size
+     for i in range(0, height, patch_size):
+         for j in range(0, width, patch_size):
+             box = (j, i, j + patch_size, i + patch_size)
+             patch = image.crop(box)
+             patches.append(patch)
+
+     return patches
+
+
+ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
+     """
+     Calculate the shape of the image patch grid after the preprocessing for images of any resolution.
+
+     Args:
+         image_size (tuple): The size of the input image in the format (width, height).
+         grid_pinpoints (str): A string representation of a list of possible resolutions.
+         patch_size (int): The size of each image patch.
+
+     Returns:
+         tuple: The shape of the image patch grid in the format (width, height).
+     """
+     if type(grid_pinpoints) is list:
+         possible_resolutions = grid_pinpoints
+     else:
+         possible_resolutions = ast.literal_eval(grid_pinpoints)
+     width, height = select_best_resolution(image_size, possible_resolutions)
+     return width // patch_size, height // patch_size
+
+
+ def process_anyres_image(image, processor, grid_pinpoints):
+     """
+     Process an image with variable resolutions.
+
+     Args:
+         image (PIL.Image.Image): The input image to be processed.
+         processor: The image processor object.
+         grid_pinpoints (str): A string representation of a list of possible resolutions.
+
+     Returns:
+         torch.Tensor: A tensor containing the processed image patches.
+     """
+     if type(grid_pinpoints) is list:
+         possible_resolutions = grid_pinpoints
+     else:
+         possible_resolutions = ast.literal_eval(grid_pinpoints)
+     best_resolution = select_best_resolution(image.size, possible_resolutions)
+     image_padded = resize_and_pad_image(image, best_resolution)
+
+     patches = divide_to_patches(image_padded, processor.crop_size['height'])
+
+     image_original_resize = image.resize((processor.size['shortest_edge'], processor.size['shortest_edge']))
+
+     image_patches = [image_original_resize] + patches
+     image_patches = [processor.preprocess(image_patch, return_tensors='pt')['pixel_values'][0]
+                      for image_patch in image_patches]
+     return torch.stack(image_patches, dim=0)
+
+
+ def load_image_from_base64(image):
+     return Image.open(BytesIO(base64.b64decode(image)))
+
+
+ def expand2square(pil_img, background_color):
+     width, height = pil_img.size
+     if width == height:
+         return pil_img
+     elif width > height:
+         result = Image.new(pil_img.mode, (width, width), background_color)
+         result.paste(pil_img, (0, (width - height) // 2))
+         return result
+     else:
+         result = Image.new(pil_img.mode, (height, height), background_color)
+         result.paste(pil_img, ((height - width) // 2, 0))
+         return result
+
+
+ def process_images(images, image_processor, model_cfg):
+     image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
+     new_images = []
+     if image_aspect_ratio == 'pad':
+         for image in images:
+             image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean))
+             image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
+             new_images.append(image)
+     elif image_aspect_ratio == "anyres":
+         for image in images:
+             image = process_anyres_image(image, image_processor, model_cfg.image_grid_pinpoints)
+             new_images.append(image)
+     else:
+         return image_processor(images, return_tensors='pt')['pixel_values']
+     if all(x.shape == new_images[0].shape for x in new_images):
+         new_images = torch.stack(new_images, dim=0)
+     return new_images
+
+
+ def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_IDX, return_tensors=None):
+     prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]
+
+     def insert_separator(X, sep):
+         return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
+
+     input_ids = []
+     offset = 0
+     if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
+         offset = 1
+         input_ids.append(prompt_chunks[0][0])
+
+     for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
+         input_ids.extend(x[offset:])
+
+     if return_tensors is not None:
+         if return_tensors == 'pt':
+             return torch.tensor(input_ids, dtype=torch.long)
+         raise ValueError(f'Unsupported tensor type: {return_tensors}')
+     return input_ids
+
+
+ def get_model_name_from_path(model_path):
+     model_path = model_path.strip("/")
+     model_paths = model_path.split("/")
+     if model_paths[-1].startswith('checkpoint-'):
+         return model_paths[-2] + "_" + model_paths[-1]
+     else:
+         return model_paths[-1]
+
+
+ class KeywordsStoppingCriteria(StoppingCriteria):
+     def __init__(self, keywords, tokenizer, input_ids):
+         self.keywords = keywords
+         self.keyword_ids = []
+         self.max_keyword_len = 0
+         for keyword in keywords:
+             cur_keyword_ids = tokenizer(keyword).input_ids
+             if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
+                 cur_keyword_ids = cur_keyword_ids[1:]
+             if len(cur_keyword_ids) > self.max_keyword_len:
+                 self.max_keyword_len = len(cur_keyword_ids)
+             self.keyword_ids.append(torch.tensor(cur_keyword_ids))
+         self.tokenizer = tokenizer
+         self.start_len = input_ids.shape[1]
+
+     def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+         offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
+         self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
+         for keyword_id in self.keyword_ids:
+             truncated_output_ids = output_ids[0, -keyword_id.shape[0]:]
+             if torch.equal(truncated_output_ids, keyword_id):
+                 return True
+         outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
+         for keyword in self.keywords:
+             if keyword in outputs:
+                 return True
+         return False
+
+     def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+         outputs = []
+         for i in range(output_ids.shape[0]):
+             outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores))
+         return all(outputs)
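A minimal usage sketch of `tokenizer_image_token` above (the tokenizer checkpoint is illustrative): each `<image>` placeholder in the prompt is spliced into the token stream as `IMAGE_TOKEN_IDX` (151655, per `constants.py`).

```python
from transformers import AutoTokenizer
from blip3o.mm_utils import tokenizer_image_token

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")  # illustrative backbone tokenizer
input_ids = tokenizer_image_token("Describe this image.\n<image>", tokenizer, return_tensors="pt")
assert (input_ids == 151655).sum() == 1  # one <image> placeholder -> one image token id
```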
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from .language_model.blip3o_qwen import blip3oQwenForCausalLM, blip3oQwenConfig
+ from .language_model.blip3o_qwen_inference import blip3oQwenForInferenceLM, blip3oQwenConfig
+
+
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (397 Bytes).
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/__pycache__/blip3o_arch.cpython-311.pyc ADDED
Binary file (20.8 kB).
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/__pycache__/builder.cpython-311.pyc ADDED
Binary file (4.06 kB).
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/__pycache__/llava_arch.cpython-311.pyc ADDED
Binary file (19.3 kB).
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/__pycache__/lumina_nextdit2d.cpython-311.pyc ADDED
Binary file (17 kB).
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/__pycache__/nextdit_crossattn.cpython-311.pyc ADDED
Binary file (4.35 kB).
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/apply_delta.py ADDED
@@ -0,0 +1,48 @@
+ """
+ Usage:
+ python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta
+ """
+ import argparse
+
+ import torch
+ from tqdm import tqdm
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from blip3o import blip3oLlamaForCausalLM
+
+
+ def apply_delta(base_model_path, target_model_path, delta_path):
+     print("Loading base model")
+     base = AutoModelForCausalLM.from_pretrained(
+         base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
+
+     print("Loading delta")
+     delta = blip3oLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
+     delta_tokenizer = AutoTokenizer.from_pretrained(delta_path)
+
+     print("Applying delta")
+     for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"):
+         if name not in base.state_dict():
+             assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model'
+             continue
+         if param.data.shape == base.state_dict()[name].shape:
+             param.data += base.state_dict()[name]
+         else:
+             assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \
+                 f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}'
+             bparam = base.state_dict()[name]
+             param.data[:bparam.shape[0], :bparam.shape[1]] += bparam
+
+     print("Saving target model")
+     delta.save_pretrained(target_model_path)
+     delta_tokenizer.save_pretrained(target_model_path)
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--base-model-path", type=str, required=True)
+     parser.add_argument("--target-model-path", type=str, required=True)
+     parser.add_argument("--delta-path", type=str, required=True)
+
+     args = parser.parse_args()
+
+     apply_delta(args.base_model_path, args.target_model_path, args.delta_path)
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/blip3o_arch.py ADDED
@@ -0,0 +1,371 @@
+ from abc import ABC, abstractmethod
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ from .multimodal_encoder.builder import build_vision_tower, build_gen_vision_tower, build_dit
+ from .multimodal_projector.builder import build_vision_projector, build_down_projector, build_gen_vision_projector
+
+ from blip3o.constants import IGNORE_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IMAGE_TOKEN_IDX, DEFAULT_IM_START_TOKEN_IDX, DEFAULT_IM_END_TOKEN_IDX, UND_IMAGE_TOKEN_IDX
+
+
+ class blip3oMetaModel:
+
+     def __init__(self, config):
+         super(blip3oMetaModel, self).__init__(config)
+
+         if hasattr(config, "gen_vision_tower"):
+             self.gen_vision_tower = build_gen_vision_tower(config, delay_load=True)
+             self.latent_queries = nn.Parameter(torch.randn(1, config.n_query, config.hidden_size))
+             print(f" latent query size {self.latent_queries.shape}")
+
+             if 'unpad' in getattr(config, 'mm_patch_merge_type', ''):
+                 self.image_newline = nn.Parameter(
+                     torch.empty(config.hidden_size, dtype=self.dtype)
+                 )
+
+             self.dit, self.noise_scheduler = build_dit(config)
+
+     def get_vision_tower(self):
+         vision_tower = getattr(self, 'vision_tower', None)
+         if type(vision_tower) is list:
+             vision_tower = vision_tower[0]
+         return vision_tower
+
+     def get_gen_vision_tower(self):
+         gen_vision_tower = getattr(self, 'gen_vision_tower', None)
+         if type(gen_vision_tower) is list:
+             gen_vision_tower = gen_vision_tower[0]
+         return gen_vision_tower
+
+     def initialize_vision_modules(self, model_args, fsdp=None):
+         gen_vision_tower = model_args.gen_vision_tower
+
+         mm_vision_select_layer = model_args.mm_vision_select_layer
+         mm_vision_select_feature = model_args.mm_vision_select_feature
+
+         pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter
+         pretrain_gen_mlp_adapter = model_args.pretrain_gen_mlp_adapter
+
+         mm_patch_merge_type = model_args.mm_patch_merge_type
+
+         self.config.gen_vision_tower = gen_vision_tower
+         self.config.vision_tower_pretrained = getattr(model_args, "vision_tower_pretrained", "")
+
+         if getattr(self, 'dit', None) is None:
+             print("Randomly initializing the DiT!")
+             self.dit, self.noise_scheduler = build_dit(model_args)
+         else:
+             print("Loading DiT from checkpoint!")
+             for p in self.dit.parameters():
+                 p.requires_grad = True
+
+         if self.get_vision_tower() is None:
+             vision_tower = build_vision_tower(model_args)
+             if fsdp is not None and len(fsdp) > 0:
+                 self.vision_tower = [vision_tower]
+             else:
+                 self.vision_tower = vision_tower
+         else:
+             if fsdp is not None and len(fsdp) > 0:
+                 vision_tower = self.vision_tower[0]
+             else:
+                 vision_tower = self.vision_tower
+             vision_tower.load_model()
+
+         if self.get_gen_vision_tower() is None:
+             gen_vision_tower = build_gen_vision_tower(model_args)
+
+             if fsdp is not None and len(fsdp) > 0:
+                 self.gen_vision_tower = [gen_vision_tower]
+             else:
+                 self.gen_vision_tower = gen_vision_tower
+         else:
+             if fsdp is not None and len(fsdp) > 0:
+                 gen_vision_tower = self.gen_vision_tower[0]
+             else:
+                 gen_vision_tower = self.gen_vision_tower
+             gen_vision_tower.load_model()
+
+         self.config.use_mm_proj = True
+         self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear')
+
+         self.config.mm_hidden_size = vision_tower.config.hidden_size
+
+         if 'eva' in model_args.gen_vision_tower:
+             self.config.gen_hidden_size = gen_vision_tower.hidden_size
+         elif 'siglip2' in model_args.gen_vision_tower:
+             self.config.gen_hidden_size = gen_vision_tower.config.hidden_size
+
+         self.config.mm_vision_select_layer = mm_vision_select_layer
+         self.config.mm_vision_select_feature = mm_vision_select_feature
+         self.config.mm_patch_merge_type = mm_patch_merge_type
+         self.config.n_query = model_args.n_query
+         self.config.gen_pooling = model_args.gen_pooling
+
+         if getattr(self, 'mm_projector', None) is None:
+             print("Randomly initializing the image understanding projector!")
+             self.mm_projector = build_vision_projector(self.config)
+             if 'unpad' in mm_patch_merge_type:
+                 embed_std = 1 / torch.sqrt(torch.tensor(self.config.hidden_size, dtype=self.dtype))
+                 self.image_newline = nn.Parameter(
+                     torch.randn(self.config.hidden_size, dtype=self.dtype) * embed_std
+                 )
+         else:
+             print("Loading image understanding projector from checkpoint!")
+             for p in self.mm_projector.parameters():
+                 p.requires_grad = True
+
+         if getattr(self, 'down_projector', None) is None:
+             self.down_projector = build_down_projector(self.config)
+         else:
+             for p in self.down_projector.parameters():
+                 p.requires_grad = True
+
+         if getattr(self, 'latent_queries', None) is None:
+             print("Randomly initializing the latent_queries!")
+             self.latent_queries = nn.Parameter(torch.randn(1, self.config.n_query, self.config.hidden_size))
+         else:
+             print("Loading latent_queries from checkpoint!")
+             self.latent_queries.requires_grad = True
+
+         if pretrain_mm_mlp_adapter is not None:
+             mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
+             def get_w(weights, keyword):
+                 return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k}
+
+
+ def unpad_image(tensor, original_size):
+     """
+     Unpads a PyTorch tensor of a padded and resized image.
+
+     Args:
+         tensor (torch.Tensor): The image tensor, assumed to be in CxHxW format.
+         original_size (tuple): The original size of PIL image (width, height).
+
+     Returns:
+         torch.Tensor: The unpadded image tensor.
+     """
+     original_width, original_height = original_size
+     current_height, current_width = tensor.shape[1:]
+
+     original_aspect_ratio = original_width / original_height
+     current_aspect_ratio = current_width / current_height
+
+     if original_aspect_ratio > current_aspect_ratio:
+         scale_factor = current_width / original_width
+         new_height = int(original_height * scale_factor)
+         padding = (current_height - new_height) // 2
+         unpadded_tensor = tensor[:, padding:current_height - padding, :]
+     else:
+         scale_factor = current_height / original_height
+         new_width = int(original_width * scale_factor)
+         padding = (current_width - new_width) // 2
+         unpadded_tensor = tensor[:, :, padding:current_width - padding]
+
+     return unpadded_tensor
+
+
+ class blip3oMetaForCausalLM(ABC):
+
+     @abstractmethod
+     def get_model(self):
+         pass
+
+     def get_vision_tower(self):
+         return self.get_model().get_vision_tower()
+
+     def get_gen_vision_tower(self):
+         return self.get_model().get_gen_vision_tower()
+
+     def encode_image(self, images):
+         gen_vision_tower = self.get_gen_vision_tower()
+         device = gen_vision_tower.device
+         images = images.to(device)
+         prompt_image_embeds = gen_vision_tower(images)
+         prompt_image_embeds = self.pool_img(prompt_image_embeds)
+         num_img, _, c = prompt_image_embeds.shape
+         # prompt_image_embeds = prompt_image_embeds.contiguous().view(-1, c)
+
+         # ------------- compute similarity -------
+         all_dist = 0
+         count = 0
+         for i in range(2, prompt_image_embeds.shape[1] - 1):
+             diff = (prompt_image_embeds[:, i, :].unsqueeze(1) - prompt_image_embeds[:, :i, :])
+             dist = torch.sqrt(diff.square().sum(-1)).min().item()
+             all_dist += dist
+             count += 1
+         all_dist /= count
+         # self.dist = all_dist
+         # print(self.dist)
+
+         return prompt_image_embeds
+
+     def get_mm_projector(self):
+         return self.get_model().mm_projector
+
+     def get_n_query(self):
+         return self.get_model().config.n_query
+
+     def get_gen_pooling(self):
+         return self.get_model().config.gen_pooling
+
+     def pool_img(self, image_features):
+         num_img, n, c = image_features.shape
+         gen_pooling = self.get_gen_pooling()
+         n_query = self.get_n_query()
+         stride = int(gen_pooling.split('_')[-1])
+         sqrt_n = int(n**0.5)
+         image_features = image_features.view(-1, sqrt_n, sqrt_n, c)
+         image_features = (
+             nn.functional.adaptive_avg_pool2d(image_features.permute(0, 3, 1, 2), int(n_query**0.5))
+         )
+         return image_features
+
+     def get_sigmas(self, timesteps, device, n_dim=4, dtype=torch.float32):
+         sigmas = self.get_model().noise_scheduler.sigmas.to(device=device, dtype=dtype)
+         schedule_timesteps = self.get_model().noise_scheduler.timesteps.to(device=device)
+         timesteps = timesteps.to(device)
+         step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+         sigma = sigmas[step_indices].flatten()
+         while len(sigma.shape) < n_dim:
+             sigma = sigma.unsqueeze(-1)
+         return sigma
+
+     def mask_drop(self, latents, drop_prob=0.1):
+         if drop_prob <= 0:
+             return latents
+         mask = torch.bernoulli(torch.zeros(latents.shape[0], device=latents.device, dtype=latents.dtype) + drop_prob)
+         while len(mask.shape) < len(latents.shape):
+             mask = mask.unsqueeze(-1)
+         mask = 1 - mask  # need to flip 0 <-> 1
+         return latents * mask
+
+     def prepare_inputs_labels_for_multimodal(
+         self, input_ids, position_ids, attention_mask, past_key_values, labels,
+         gen_images, und_images, i_s_pos, image_sizes=None
+     ):
+         vision_tower = self.get_vision_tower()
+         mm_projector = self.get_mm_projector()
+         gen_vision_tower = self.get_gen_vision_tower()
+         if (gen_images is None and und_images is None) or input_ids.shape[1] == 1:
+             return input_ids, position_ids, attention_mask, past_key_values, None, labels, None, None, None
+
+         if gen_images is not None:
+             prompt_image_embeds = gen_vision_tower(gen_images)  # prompt_image_embeds = gen_vision_tower(gen_images).last_hidden_state
+             ## pooling
+             prompt_image_embeds = self.pool_img(prompt_image_embeds)
+             target_image_embeds = torch.clone(prompt_image_embeds).detach()
+             latent_queries = self.get_model().latent_queries.repeat(gen_images.shape[0], 1, 1)
+             H = latent_queries.shape[-1]
+             latent_queries = latent_queries.contiguous().view(-1, H)
+         else:
+             target_image_embeds = None
+
+         if und_images is not None:
+             und_image_embeds = vision_tower(und_images).last_hidden_state
+             num_img, _, c = und_image_embeds.shape
+             und_image_embeds = und_image_embeds.contiguous().view(-1, c)
+             und_image_embeds = mm_projector(und_image_embeds)
+
+         image_idx = (input_ids == IMAGE_TOKEN_IDX)
+         und_image_idx = (input_ids == UND_IMAGE_TOKEN_IDX)
+         output_indicator = labels != -100
+         input_indicator = labels == -100
+
+         text_embeds = self.get_model().embed_tokens(input_ids)
+         text_embeds = text_embeds.clone()
+         gen_img_idx = torch.logical_and(output_indicator, image_idx)
+         if gen_images is not None:
+             text_embeds[gen_img_idx] = latent_queries
+
+         und_img_idx = torch.logical_and(input_indicator, und_image_idx)
+         if und_images is not None:
+             text_embeds[und_img_idx] = und_image_embeds.to(text_embeds.device)[:und_img_idx.sum(), :]
+
+         labels[image_idx] = -100
+
+         return None, position_ids, attention_mask, past_key_values, text_embeds, labels, target_image_embeds
+
+     def initialize_vision_tokenizer(self, model_args, tokenizer):
+         if model_args.mm_use_im_patch_token:
+             tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
+             self.resize_token_embeddings(len(tokenizer))
+
+         if model_args.mm_use_im_start_end:
+             num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
+             self.resize_token_embeddings(len(tokenizer))
+
+             if num_new_tokens > 0:
+                 input_embeddings = self.get_input_embeddings().weight.data
+                 output_embeddings = self.get_output_embeddings().weight.data
+
+                 input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
+                     dim=0, keepdim=True)
+                 output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
+                     dim=0, keepdim=True)
+
+                 input_embeddings[-num_new_tokens:] = input_embeddings_avg
+                 output_embeddings[-num_new_tokens:] = output_embeddings_avg
+
+             if model_args.tune_mm_mlp_adapter:
+                 for p in self.get_input_embeddings().parameters():
+                     p.requires_grad = True
+                 for p in self.get_output_embeddings().parameters():
+                     p.requires_grad = False
+
+             if model_args.pretrain_mm_mlp_adapter:
+                 mm_projector_weights = torch.load(model_args.pretrain_mm_mlp_adapter, map_location='cpu')
+                 embed_tokens_weight = mm_projector_weights['model.embed_tokens.weight']
+                 assert num_new_tokens == 2
+                 if input_embeddings.shape == embed_tokens_weight.shape:
+                     input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:]
+                 elif embed_tokens_weight.shape[0] == num_new_tokens:
+                     input_embeddings[-num_new_tokens:] = embed_tokens_weight
+                 else:
+                     raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Number of new tokens: {num_new_tokens}.")
+         elif model_args.mm_use_im_patch_token:
+             if model_args.tune_mm_mlp_adapter:
+                 for p in self.get_input_embeddings().parameters():
+                     p.requires_grad = False
+                 for p in self.get_output_embeddings().parameters():
+                     p.requires_grad = False
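To make the adaptive pooling in `pool_img` concrete, a small shape walk-through (dimensions illustrative; the final flatten back to `(num_img, n_query, c)` token form is implied by callers such as `encode_image`, which unpack three dimensions):

```python
import torch
import torch.nn.functional as F

feats = torch.randn(2, 729, 1152)                         # 2 images, 27x27 patch grid, hidden size 1152
grid = feats.view(-1, 27, 27, 1152).permute(0, 3, 1, 2)   # NCHW layout for 2D pooling
pooled = F.adaptive_avg_pool2d(grid, int(64 ** 0.5))      # n_query = 64 -> 8x8 token grid, (2, 1152, 8, 8)
tokens = pooled.flatten(2).permute(0, 2, 1)               # back to (2, 64, 1152), one row per query token
```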
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/builder.py ADDED
@@ -0,0 +1,103 @@
+ import os
+ import warnings
+ import shutil
+
+ from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
+ import torch
+ from blip3o.model import *
+ from blip3o.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
+ from blip3o.train.train import smart_tokenizer_and_embedding_resize
+
+
+ def load_pretrained_model(model_path, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
+     kwargs = {"device_map": device_map, **kwargs}
+
+     if device != "cuda":
+         kwargs['device_map'] = {"": device}
+
+     if load_8bit:
+         kwargs['load_in_8bit'] = True
+     elif load_4bit:
+         kwargs['load_in_4bit'] = True
+         kwargs['quantization_config'] = BitsAndBytesConfig(
+             load_in_4bit=True,
+             bnb_4bit_compute_dtype=torch.float16,
+             bnb_4bit_use_double_quant=True,
+             bnb_4bit_quant_type='nf4'
+         )
+     else:
+         kwargs['torch_dtype'] = torch.float16
+
+     if use_flash_attn:
+         kwargs['attn_implementation'] = 'flash_attention_2'
+
+     tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
+
+     model = blip3oQwenForInferenceLM.from_pretrained(model_path, low_cpu_mem_usage=True, torch_dtype=torch.float16).to('cuda:0')
+
+     image_processor = None
+     mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
+     mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
+     if mm_use_im_patch_token:
+         tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
+     if mm_use_im_start_end:
+         tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
+     model.resize_token_embeddings(len(tokenizer))
+
+     if hasattr(model.config, "max_sequence_length"):
+         context_len = model.config.max_sequence_length
+     else:
+         context_len = 2048
+
+     return tokenizer, model, context_len
+
+
+ def load_pretrained_model_lmms_eval(model_path, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
+     kwargs = {"device_map": device_map, **kwargs}
+
+     if device != "cuda":
+         kwargs['device_map'] = {"": device}
+
+     if load_8bit:
+         kwargs['load_in_8bit'] = True
+     elif load_4bit:
+         kwargs['load_in_4bit'] = True
+         kwargs['quantization_config'] = BitsAndBytesConfig(
+             load_in_4bit=True,
+             bnb_4bit_compute_dtype=torch.float16,
+             bnb_4bit_use_double_quant=True,
+             bnb_4bit_quant_type='nf4'
+         )
+     else:
+         kwargs['torch_dtype'] = torch.float16
+
+     if use_flash_attn:
+         kwargs['attn_implementation'] = 'flash_attention_2'
+
+     tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+     model = blip3oQwenForInferenceLM.from_pretrained(model_path, low_cpu_mem_usage=True, torch_dtype=torch.float16)
+
+     image_processor = None
+     mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
+     mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
+     if mm_use_im_patch_token:
+         tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
+     if mm_use_im_start_end:
+         tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
+     model.resize_token_embeddings(len(tokenizer))
+
+     if hasattr(model.config, "max_sequence_length"):
+         context_len = model.config.max_sequence_length
+     else:
+         context_len = 2048
+
+     return tokenizer, model, context_len
+
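A minimal usage sketch of `load_pretrained_model` above (the checkpoint path is illustrative). Note that, as written, this loader moves the model to `cuda:0`, so it requires a CUDA device:

```python
from blip3o.model.builder import load_pretrained_model

tokenizer, model, context_len = load_pretrained_model("/path/to/BLIP3o-Qwen3-Siglip2-checkpoint")
print(context_len)  # 2048 unless the model config defines max_sequence_length
```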
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/consolidate.py ADDED
@@ -0,0 +1,25 @@
+ import argparse
+
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from blip3o.model import *
+ from blip3o.model.utils import auto_upgrade
+
+
+ def consolidate_ckpt(src_path, dst_path):
+     print("Loading model")
+     auto_upgrade(src_path)
+     src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
+     src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False)
+     src_model.save_pretrained(dst_path)
+     src_tokenizer.save_pretrained(dst_path)
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--src", type=str, required=True)
+     parser.add_argument("--dst", type=str, required=True)
+
+     args = parser.parse_args()
+
+     consolidate_ckpt(args.src, args.dst)
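Note: a sketch of invoking this script from the shell, assuming the blip3o package is importable (paths are placeholders):

    python -m blip3o.model.consolidate --src /path/to/raw_checkpoint --dst /path/to/consolidated_checkpoint
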
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/language_model/__pycache__/blip3o_llama.cpython-311.pyc ADDED
Binary file (21.3 kB).
 
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/language_model/__pycache__/blip3o_qwen.cpython-311.pyc ADDED
Binary file (20.5 kB).
 
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/language_model/__pycache__/blip3o_qwen_inference.cpython-311.pyc ADDED
Binary file (20.7 kB).
 
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/language_model/__pycache__/llava_llama.cpython-311.pyc ADDED
Binary file (21.3 kB).
 
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/language_model/__pycache__/llava_qwen.cpython-311.pyc ADDED
Binary file (19.5 kB).
 
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/language_model/blip3o_qwen.py ADDED
@@ -0,0 +1,421 @@
+ from typing import List, Optional, Tuple, Union, Dict
+ import torch
+ import torch.nn as nn
+ from PIL import Image
+ import torch.nn.functional as F
+
+ import transformers
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+ from transformers.modeling_outputs import CausalLMOutputWithPast
+ from transformers.generation.utils import GenerateOutput
+
+ from blip3o.model.blip3o_arch import blip3oMetaModel, blip3oMetaForCausalLM
+ from transformers import Qwen3Config, Qwen3Model, Qwen3ForCausalLM
+ from blip3o.constants import UND_IMAGE_TOKEN_IDX, DEFAULT_IM_START_TOKEN_IDX, DEFAULT_IMAGE_TOKEN
+
+ from diffusers.utils.torch_utils import randn_tensor
+ from diffusers.pipelines.pipeline_utils import numpy_to_pil
+ import numpy as np
+ from diffusers.models import AutoencoderKL
+ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
+
+
+ class blip3oQwenConfig(Qwen3Config):
+     model_type = "blip3o_qwen"
+
+
+ class blip3oQwenModel(blip3oMetaModel, Qwen3Model):
+     config_class = blip3oQwenConfig
+
+     def __init__(self, config: Qwen3Config):
+         super(blip3oQwenModel, self).__init__(config)
+
+
+ class blip3oQwenForCausalLM(Qwen3ForCausalLM, blip3oMetaForCausalLM):
+     config_class = blip3oQwenConfig
+
+     def __init__(self, config):
+         Qwen3ForCausalLM.__init__(self, config)
+         config.model_type = "blip3o_qwen"
+
+         self.model = blip3oQwenModel(config)
+         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def get_model(self):
+         return self.model
+
+     def forward(
+         self,
+         input_ids: torch.LongTensor = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[List[torch.FloatTensor]] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         ids: Optional[list] = None,
+         i_s_pos: Optional[list] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         gen_image: Optional[torch.FloatTensor] = None,
+         und_image: Optional[torch.FloatTensor] = None,
+         image_sizes: Optional[List[List[int]]] = None,
+         return_dict: Optional[bool] = None,
+         cache_position: Optional[torch.LongTensor] = None
+     ) -> Union[Tuple, CausalLMOutputWithPast]:
+
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+         if inputs_embeds is None:
+             (
+                 input_ids,
+                 position_ids,
+                 attention_mask,
+                 past_key_values,
+                 inputs_embeds,
+                 labels,
+                 latents
+             ) = self.prepare_inputs_labels_for_multimodal(
+                 input_ids,
+                 position_ids,
+                 attention_mask,
+                 past_key_values,
+                 labels,
+                 gen_image,
+                 und_image,
+                 i_s_pos,
+                 image_sizes
+             )
+
+         outputs = self.model(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             past_key_values=past_key_values,
+             inputs_embeds=inputs_embeds,
+             use_cache=use_cache,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         hidden_states = outputs[0]
+         logits = self.lm_head(hidden_states)
+         logits = logits.float()
+
+         # image understanding loss
+         loss = 0
+         if und_image is not None:
+             # Shift so that tokens < n predict n
+             shift_logits = logits[..., :-1, :].contiguous()
+             shift_labels = labels[..., 1:].contiguous()
+             # Flatten the tokens
+             loss_fct = torch.nn.CrossEntropyLoss()
+             shift_logits = shift_logits.view(-1, self.config.vocab_size)
+             shift_labels = shift_labels.view(-1)
+             # Enable model parallelism
+             shift_labels = shift_labels.to(shift_logits.device)
+             loss = loss_fct(shift_logits, shift_labels)
+
+         # image generation loss
+         img_loss = 0
+         if gen_image is not None:
+             img_hidden_states = []
+             for b in range(hidden_states.shape[0]):
+                 if not i_s_pos[b] == -1:
+                     img_hidden_states.append(hidden_states[b, i_s_pos[b]:i_s_pos[b] + 64, :])
+
+             img_hidden_states = torch.stack(img_hidden_states, dim=0)
+             img_hidden_states = self.get_model().down_projector(img_hidden_states)
+             bsz = latents.shape[0]
+             dtype = latents.dtype
+             noise = torch.randn_like(latents, device=latents.device)
+             u = torch.rand(size=(bsz,), device="cpu")
+             indices = (u * self.get_model().noise_scheduler.config.num_train_timesteps).long()
+             timesteps = self.get_model().noise_scheduler.timesteps[indices].to(device=latents.device)
+             sigmas = self.get_sigmas(timesteps, latents.device, n_dim=latents.ndim, dtype=dtype)
+             noisy_latents = (1.0 - sigmas) * latents + sigmas * noise
+             noise_pred = self.get_model().dit(
+                 x=noisy_latents,
+                 timestep=timesteps,
+                 z_latents=self.mask_drop(img_hidden_states),
+             )
+             target = noise - latents
+             img_loss = F.mse_loss(noise_pred.float(), target.float(), reduction="mean")
+
+         print(f"img loss {img_loss}, text loss {loss}")
+         total_loss = img_loss + loss
+
+         return CausalLMOutputWithPast(
+             loss=total_loss,
+             logits=logits,
+             past_key_values=outputs.past_key_values,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+     @torch.no_grad()
+     def generate(
+         self,
+         inputs: Optional[torch.Tensor] = None,
+         images: Optional[torch.Tensor] = None,
+         image_sizes: Optional[torch.Tensor] = None,
+         **kwargs,
+     ) -> Union[GenerateOutput, torch.LongTensor]:
+         position_ids = kwargs.pop("position_ids", None)
+         attention_mask = kwargs.pop("attention_mask", None)
+         if "inputs_embeds" in kwargs:
+             raise NotImplementedError("`inputs_embeds` is not supported")
+
+         if images is not None:
+             (
+                 inputs,
+                 position_ids,
+                 attention_mask,
+                 _,
+                 inputs_embeds,
+                 img_indicator,
+                 _
+             ) = self.prepare_inputs_labels_for_understanding(
+                 inputs,
+                 position_ids,
+                 attention_mask,
+                 None,
+                 None,
+                 images,
+                 image_sizes=image_sizes
+             )
+         else:
+             inputs_embeds = self.get_model().embed_tokens(inputs)
+
+         return super().generate(
+             position_ids=position_ids,
+             attention_mask=attention_mask,
+             inputs_embeds=inputs_embeds,
+             **kwargs
+         )
+
+     @torch.no_grad()
+     def generate_image(
+         self,
+         text: List[str],
+         tokenizer: AutoTokenizer,
+         pixel_values: Optional[torch.Tensor] = None,
+         image_grid_thw: Optional[torch.Tensor] = None,
+         max_var: Optional[float] = None,
+     ):
+         scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained("Alpha-VLLM/Lumina-Next-SFT-diffusers", subfolder="scheduler")
+
+         N_QUERY = self.get_n_query()
+         inputs = tokenizer(text, padding="longest", return_tensors="pt")
+         device = self.get_model().device
+         attention_mask = inputs.attention_mask.to(device)
+         input_ids = inputs.input_ids.to(device)  # B x N
+         input_ids = torch.cat([input_ids, torch.tensor([[DEFAULT_IM_START_TOKEN_IDX]]).to(device)], dim=1)
+
+         text_embeds = self.get_model().embed_tokens(input_ids)
+         latent_queries = self.get_model().latent_queries.repeat(text_embeds.shape[0], 1, 1)
+
+         if pixel_values is not None:
+             und_image_idx = (input_ids == UND_IMAGE_TOKEN_IDX)
+             pixel_values = pixel_values.type(self.visual.dtype)
+             und_image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
+             text_embeds[und_image_idx] = und_image_embeds.to(text_embeds.device)[:und_image_idx.sum(), :]
+
+         text_embeds = torch.cat([text_embeds, latent_queries], dim=1)
+         attention_mask = torch.cat([attention_mask, torch.ones_like(latent_queries[:, :, 0])], dim=1)
+
+         outputs = self.model(
+             inputs_embeds=text_embeds,
+             attention_mask=attention_mask,
+             output_hidden_states=True,
+             return_dict=True,
+         )
+         hidden_states = outputs.hidden_states[-1][:, -N_QUERY:, :]
+         img_hidden_states = hidden_states
+         output_img = self.sample_images(img_hidden_states, scheduler)
+         output_img = output_img.view(1, 1792, -1).permute(0, 2, 1).contiguous()
+
+         return output_img
+
+     def sample_images(
+         self,
+         img_hidden_states,
+         scheduler,
+         guidance_scale: float = 3.0,
+         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+         num_inference_steps: int = 30,
+         num_images_per_prompt: int = 1,
+         return_tensor=False,
+         **kwargs,
+     ):
+         device = img_hidden_states.device
+         dtype = img_hidden_states.dtype
+
+         img_hidden_states_null = torch.zeros_like(img_hidden_states, device=device, dtype=dtype)
+         img_hidden_states_input = torch.cat([img_hidden_states_null, img_hidden_states], 0)
+
+         batch_size = img_hidden_states.shape[0]
+         latent_size = self.get_model().dit.config.input_size
+         latent_channels = self.get_model().dit.config.in_channels
+
+         latents = randn_tensor(
+             shape=(batch_size * num_images_per_prompt, latent_channels, latent_size, latent_size),
+             generator=generator,
+             device=device,
+             dtype=dtype,
+         )
+
+         # set step values
+         sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
+         scheduler.set_timesteps(num_inference_steps, sigmas=sigmas)
+
+         # Repeat z_latents and conditions for each image per prompt
+         img_hidden_states_input = img_hidden_states_input.repeat_interleave(num_images_per_prompt, dim=0)
+
+         for t in scheduler.timesteps:
+             latent_model_input = latents.repeat(2, 1, 1, 1)
+             if hasattr(scheduler, "scale_model_input"):
+                 latent_model_input = scheduler.scale_model_input(latent_model_input, t)
+
+             # predict noise model_output
+             noise_pred = self.get_model().dit(
+                 x=latent_model_input,
+                 timestep=t.unsqueeze(0).expand(latent_model_input.shape[0]).to(latent_model_input.device, torch.long),
+                 z_latents=img_hidden_states_input,
+             )
+
+             # perform guidance
+             noise_pred_uncond, noise_pred = noise_pred.chunk(2)
+             noise_pred = noise_pred_uncond + guidance_scale * (noise_pred - noise_pred_uncond)
+
+             # compute previous image: x_t -> x_t-1
+             latents = scheduler.step(noise_pred, t, latents).prev_sample
+
+         return latents
+
+     def decode_latents(self, latents, normalize=True, return_tensor=False):
+         if isinstance(self.get_model().vae, AutoencoderKL):
+             latents = latents / self.get_model().vae.config.scaling_factor
+             if self.get_model().vae.config.shift_factor is not None:
+                 latents = latents + self.get_model().vae.config.shift_factor
+             latents = latents.to(dtype=torch.float32)
+             samples = self.get_model().vae.decode(latents).sample
+         else:
+             samples = self.get_model().vae.decode(latents)
+         if normalize:
+             samples = (samples / 2 + 0.5).clamp(0, 1)
+         else:
+             samples = samples.clamp(-1, 1)
+         if return_tensor:
+             return samples
+         samples = samples.cpu().permute(0, 2, 3, 1).float().numpy()
+         samples = numpy_to_pil(samples)
+         return samples
+
+     def prepare_and_encode_inputs(
+         self,
+         inputs: List[str | Image.Image],
+         tokenizer: AutoTokenizer,
+         do_classifier_free_guidance: bool = False,
+     ):
+         device = self.get_model().device
+         dtype = self.get_model().dtype
+
+         has_image, has_text = False, False
+         text_prompt, image_prompt = "", []
+         img_processor = self.get_vision_tower().image_processor
+         negative_prompt = {}
+
+         for x in inputs:
+             if isinstance(x, str):
+                 has_text = True
+                 text_prompt += x
+             else:
+                 has_image = True
+                 text_prompt += DEFAULT_IMAGE_TOKEN
+                 image_prompt.append(img_processor.preprocess(x, return_tensors='pt')['pixel_values'])
+         if len(image_prompt) == 0:
+             image_prompt = None
+         else:
+             image_prompt = torch.cat(image_prompt)
+             image_prompt = image_prompt.type(dtype).to(device)
+
+         if has_image and not has_text:
+             prompt = self.encode_images(image_prompt)
+             if do_classifier_free_guidance:
+                 key = "[NULL_IMAGE]"
+                 if key not in negative_prompt:
+                     negative_image = torch.zeros_like(image_prompt)
+                     negative_prompt[key] = self.encode_images(negative_image)
+                 prompt = torch.cat([prompt, negative_prompt[key]], dim=0)
+         else:
+             prompt = self.generate_image(text=[text_prompt], image=image_prompt, tokenizer=tokenizer)
+             if do_classifier_free_guidance:
+                 key = ""
+                 if key not in negative_prompt:
+                     negative_prompt[key] = self.generate_image(text=[""], tokenizer=tokenizer)
+                 prompt = torch.cat([prompt, negative_prompt[key]], dim=0)
+
+         gen_pooling = self.get_gen_pooling()
+         n_query = self.get_n_query()
+         num_img, _, c = prompt.shape
+         if 'pool2d' in gen_pooling and has_text and 'early' not in gen_pooling:
+             stride = int(gen_pooling.split('_')[1])
+             sqrt_n = int(n_query**0.5)
+             prompt = prompt.permute(0, 2, 1).reshape(num_img, -1, sqrt_n, sqrt_n)
+             prompt = F.avg_pool2d(prompt, kernel_size=(stride, stride), stride=stride)
+             prompt = prompt.reshape(num_img, c, -1).permute(0, 2, 1)
+         return prompt
+
+     def prepare_inputs_for_generation(self, input_ids, past_key_values=None,
+                                       inputs_embeds=None, **kwargs):
+         images = kwargs.pop("images", None)
+         image_sizes = kwargs.pop("image_sizes", None)
+         inputs = super().prepare_inputs_for_generation(
+             input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
+         )
+         if images is not None:
+             inputs['images'] = images
+         if image_sizes is not None:
+             inputs['image_sizes'] = image_sizes
+         return inputs
+
+
+ AutoConfig.register("blip3o_qwen", blip3oQwenConfig)
+ AutoModelForCausalLM.register(blip3oQwenConfig, blip3oQwenForCausalLM)
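Note: the image-generation branch of forward() trains the DiT with a rectified-flow (flow-matching) objective. A self-contained sketch of the same interpolation and velocity target, with toy tensors standing in for the VAE latents and the DiT call (all names here are illustrative, not the module's API):

    import torch
    import torch.nn.functional as F

    latents = torch.randn(2, 4, 32, 32)        # clean latents x0
    noise = torch.randn_like(latents)          # pure noise x1
    sigmas = torch.rand(2).view(-1, 1, 1, 1)   # per-sample noise level in [0, 1)
    noisy = (1.0 - sigmas) * latents + sigmas * noise
    target = noise - latents                   # velocity the DiT learns to predict
    fake_pred = torch.zeros_like(target)       # stands in for dit(noisy, timesteps, cond)
    loss = F.mse_loss(fake_pred, target)
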
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/language_model/blip3o_qwen_inference.py ADDED
@@ -0,0 +1,418 @@
+ from typing import List, Optional, Tuple, Union, Dict
+ import torch
+ import torch.nn as nn
+ from PIL import Image
+ import torch.nn.functional as F
+
+ import transformers
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
+ from transformers.modeling_outputs import CausalLMOutputWithPast
+ from transformers.generation.utils import GenerateOutput
+
+ from blip3o.model.blip3o_arch import blip3oMetaModel, blip3oMetaForCausalLM
+ from transformers import Qwen3Config, Qwen3Model, Qwen3ForCausalLM
+ from blip3o.constants import UND_IMAGE_TOKEN_IDX, DEFAULT_IM_START_TOKEN_IDX, DEFAULT_IMAGE_TOKEN
+
+ from diffusers.utils.torch_utils import randn_tensor
+ from diffusers.pipelines.pipeline_utils import numpy_to_pil
+ import numpy as np
+ from diffusers.models import AutoencoderKL
+ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
+
+
+ class blip3oQwenConfig(Qwen3Config):
+     model_type = "blip3o_qwen"
+
+
+ class blip3oQwenModel(blip3oMetaModel, Qwen3Model):
+     config_class = blip3oQwenConfig
+
+     def __init__(self, config: Qwen3Config):
+         super(blip3oQwenModel, self).__init__(config)
+
+
+ class blip3oQwenForInferenceLM(Qwen3ForCausalLM, blip3oMetaForCausalLM):
+     config_class = blip3oQwenConfig
+
+     def __init__(self, config):
+         Qwen3ForCausalLM.__init__(self, config)
+         config.model_type = "blip3o_qwen"
+
+         self.model = blip3oQwenModel(config)
+         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def get_model(self):
+         return self.model
+
+     def forward(
+         self,
+         input_ids: torch.LongTensor = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[List[torch.FloatTensor]] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         ids: Optional[list] = None,
+         i_s_pos: Optional[list] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         gen_image: Optional[torch.FloatTensor] = None,
+         und_image: Optional[torch.FloatTensor] = None,
+         image_sizes: Optional[List[List[int]]] = None,
+         return_dict: Optional[bool] = None,
+         cache_position: Optional[torch.LongTensor] = None
+     ) -> Union[Tuple, CausalLMOutputWithPast]:
+
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         if inputs_embeds is None:
+             (
+                 input_ids,
+                 position_ids,
+                 attention_mask,
+                 past_key_values,
+                 inputs_embeds,
+                 labels,
+                 latents
+             ) = self.prepare_inputs_labels_for_multimodal(
+                 input_ids,
+                 position_ids,
+                 attention_mask,
+                 past_key_values,
+                 labels,
+                 gen_image,
+                 und_image,
+                 i_s_pos,
+                 image_sizes
+             )
+
+         outputs = self.model(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             past_key_values=past_key_values,
+             inputs_embeds=inputs_embeds,
+             use_cache=use_cache,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         hidden_states = outputs[0]
+         logits = self.lm_head(hidden_states)
+         logits = logits.float()
+
+         total_loss = None
+         if labels is not None:
+             # Shift so that tokens < n predict n
+             shift_logits = logits[..., :-1, :].contiguous()
+             shift_labels = labels[..., 1:].contiguous()
+             # Flatten the tokens
+             loss_fct = torch.nn.CrossEntropyLoss()
+             shift_logits = shift_logits.view(-1, self.config.vocab_size)
+             shift_labels = shift_labels.view(-1)
+             # Enable model parallelism
+             shift_labels = shift_labels.to(shift_logits.device)
+             loss = loss_fct(shift_logits, shift_labels)
+
+             # compute image loss
+             img_loss_funct = torch.nn.MSELoss()
+             img_hidden_states = []
+             for b in range(hidden_states.shape[0]):
+                 img_hidden_states.append(hidden_states[b, i_s_pos[b]:i_s_pos[b] + 64, :])
+             img_hidden_states = torch.stack(img_hidden_states, dim=0)
+             img_hidden_states = self.get_model().down_projector(img_hidden_states)
+             if latents is None:
+                 img_loss = img_loss_funct(img_hidden_states, torch.clone(img_hidden_states.detach()))
+             else:
+                 bsz = latents.shape[0]
+                 dtype = latents.dtype
+                 noise = torch.randn_like(latents, device=latents.device)
+                 u = torch.rand(size=(bsz,), device="cpu")
+                 indices = (u * self.get_model().noise_scheduler.config.num_train_timesteps).long()
+                 timesteps = self.get_model().noise_scheduler.timesteps[indices].to(device=latents.device)
+                 sigmas = self.get_sigmas(timesteps, latents.device, n_dim=latents.ndim, dtype=dtype)
+                 noisy_latents = (1.0 - sigmas) * latents + sigmas * noise
+                 noise_pred = self.get_model().dit(
+                     x=noisy_latents,
+                     timestep=timesteps,
+                     z_latents=self.mask_drop(img_hidden_states),
+                 )
+                 target = noise - latents
+                 img_loss = F.mse_loss(noise_pred.float(), target.float(), reduction="mean")
+             print(f"img loss {img_loss}")
+             total_loss = img_loss
+
+         return CausalLMOutputWithPast(
+             loss=total_loss,
+             logits=logits,
+             past_key_values=outputs.past_key_values,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+     @torch.no_grad()
+     def generate(
+         self,
+         inputs: Optional[torch.Tensor] = None,
+         images: Optional[torch.Tensor] = None,
+         image_sizes: Optional[torch.Tensor] = None,
+         **kwargs,
+     ) -> Union[GenerateOutput, torch.LongTensor]:
+         position_ids = kwargs.pop("position_ids", None)
+         attention_mask = kwargs.pop("attention_mask", None)
+         if "inputs_embeds" in kwargs:
+             raise NotImplementedError("`inputs_embeds` is not supported")
+
+         if images is not None:
+             (
+                 inputs,
+                 position_ids,
+                 attention_mask,
+                 _,
+                 inputs_embeds,
+                 img_indicator,
+                 _
+             ) = self.prepare_inputs_labels_for_understanding(
+                 inputs,
+                 position_ids,
+                 attention_mask,
+                 None,
+                 None,
+                 images,
+                 image_sizes=image_sizes
+             )
+         else:
+             inputs_embeds = self.get_model().embed_tokens(inputs)
+
+         return super().generate(
+             position_ids=position_ids,
+             attention_mask=attention_mask,
+             inputs_embeds=inputs_embeds,
+             **kwargs
+         )
+
+     @torch.no_grad()
+     def generate_image(
+         self,
+         text: List[str],
+         tokenizer: AutoTokenizer,
+         pixel_values: Optional[torch.Tensor] = None,
+         image_grid_thw: Optional[torch.Tensor] = None,
+         max_var: Optional[float] = None,
+     ):
+         scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained("Alpha-VLLM/Lumina-Next-SFT-diffusers", subfolder="scheduler")
+
+         N_QUERY = self.get_n_query()
+         inputs = tokenizer(text, padding="longest", return_tensors="pt")
+         device = self.get_model().device
+         attention_mask = inputs.attention_mask.to(device)
+         input_ids = inputs.input_ids.to(device)  # B x N
+         input_ids = torch.cat([input_ids, torch.tensor([[DEFAULT_IM_START_TOKEN_IDX]]).to(device)], dim=1)
+
+         text_embeds = self.get_model().embed_tokens(input_ids)
+         latent_queries = self.get_model().latent_queries.repeat(text_embeds.shape[0], 1, 1)
+
+         if pixel_values is not None:
+             und_image_idx = (input_ids == UND_IMAGE_TOKEN_IDX)
+             pixel_values = pixel_values.type(self.visual.dtype)
+             und_image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
+             text_embeds[und_image_idx] = und_image_embeds.to(text_embeds.device)[:und_image_idx.sum(), :]
+
+         text_embeds = torch.cat([text_embeds, latent_queries], dim=1)
+         attention_mask = torch.cat([attention_mask, torch.ones_like(latent_queries[:, :, 0])], dim=1)
+
+         outputs = self.model(
+             inputs_embeds=text_embeds,
+             attention_mask=attention_mask,
+             output_hidden_states=True,
+             return_dict=True,
+         )
+         hidden_states = outputs.hidden_states[-1][:, -N_QUERY:, :]
+         img_hidden_states = hidden_states
+         output_img = self.sample_images(img_hidden_states, scheduler)
+         output_img = output_img.view(1, 1792, -1).permute(0, 2, 1).contiguous()
+
+         return output_img
+
+     def sample_images(
+         self,
+         img_hidden_states,
+         scheduler,
+         guidance_scale: float = 3.0,
+         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+         num_inference_steps: int = 30,
+         num_images_per_prompt: int = 1,
+         return_tensor=False,
+         **kwargs,
+     ):
+         device = img_hidden_states.device
+         dtype = img_hidden_states.dtype
+
+         img_hidden_states_null = torch.zeros_like(img_hidden_states, device=device, dtype=dtype)
+         img_hidden_states_input = torch.cat([img_hidden_states_null, img_hidden_states], 0)
+
+         batch_size = img_hidden_states.shape[0]
+         latent_size = self.get_model().dit.config.input_size
+         latent_channels = self.get_model().dit.config.in_channels
+
+         latents = randn_tensor(
+             shape=(batch_size * num_images_per_prompt, latent_channels, latent_size, latent_size),
+             generator=generator,
+             device=device,
+             dtype=dtype,
+         )
+
+         # set step values
+         sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
+         scheduler.set_timesteps(num_inference_steps, sigmas=sigmas)
+
+         # Repeat z_latents and conditions for each image per prompt
+         img_hidden_states_input = img_hidden_states_input.repeat_interleave(num_images_per_prompt, dim=0)
+
+         for t in scheduler.timesteps:
+             latent_model_input = latents.repeat(2, 1, 1, 1)
+             if hasattr(scheduler, "scale_model_input"):
+                 latent_model_input = scheduler.scale_model_input(latent_model_input, t)
+
+             # predict noise model_output
+             noise_pred = self.get_model().dit(
+                 x=latent_model_input,
+                 timestep=t.unsqueeze(0).expand(latent_model_input.shape[0]).to(latent_model_input.device, torch.long),
+                 z_latents=img_hidden_states_input,
+             )
+
+             # perform guidance
+             noise_pred_uncond, noise_pred = noise_pred.chunk(2)
+             noise_pred = noise_pred_uncond + guidance_scale * (noise_pred - noise_pred_uncond)
+
+             # compute previous image: x_t -> x_t-1
+             latents = scheduler.step(noise_pred, t, latents).prev_sample
+
+         return latents
+
+     def decode_latents(self, latents, normalize=True, return_tensor=False):
+         if isinstance(self.get_model().vae, AutoencoderKL):
+             latents = latents / self.get_model().vae.config.scaling_factor
+             if self.get_model().vae.config.shift_factor is not None:
+                 latents = latents + self.get_model().vae.config.shift_factor
+             latents = latents.to(dtype=torch.float32)
+             samples = self.get_model().vae.decode(latents).sample
+         else:
+             samples = self.get_model().vae.decode(latents)
+         if normalize:
+             samples = (samples / 2 + 0.5).clamp(0, 1)
+         else:
+             samples = samples.clamp(-1, 1)
+         if return_tensor:
+             return samples
+         samples = samples.cpu().permute(0, 2, 3, 1).float().numpy()
+         samples = numpy_to_pil(samples)
+         return samples
+
+     def prepare_and_encode_inputs(
+         self,
+         inputs: List[str | Image.Image],
+         tokenizer: AutoTokenizer,
+         do_classifier_free_guidance: bool = False,
+     ):
+         device = self.get_model().device
+         dtype = self.get_model().dtype
+
+         has_image, has_text = False, False
+         text_prompt, image_prompt = "", []
+         img_processor = self.get_vision_tower().image_processor
+         negative_prompt = {}
+
+         for x in inputs:
+             if isinstance(x, str):
+                 has_text = True
+                 text_prompt += x
+             else:
+                 has_image = True
+                 text_prompt += DEFAULT_IMAGE_TOKEN
+                 image_prompt.append(img_processor.preprocess(x, return_tensors='pt')['pixel_values'])
+         if len(image_prompt) == 0:
+             image_prompt = None
+         else:
+             image_prompt = torch.cat(image_prompt)
+             image_prompt = image_prompt.type(dtype).to(device)
+
+         if has_image and not has_text:
+             prompt = self.encode_images(image_prompt)
+             if do_classifier_free_guidance:
+                 key = "[NULL_IMAGE]"
+                 if key not in negative_prompt:
+                     negative_image = torch.zeros_like(image_prompt)
+                     negative_prompt[key] = self.encode_images(negative_image)
+                 prompt = torch.cat([prompt, negative_prompt[key]], dim=0)
+         else:
+             prompt = self.generate_image(text=[text_prompt], image=image_prompt, tokenizer=tokenizer)
+             if do_classifier_free_guidance:
+                 key = ""
+                 if key not in negative_prompt:
+                     negative_prompt[key] = self.generate_image(text=[""], tokenizer=tokenizer)
+                 prompt = torch.cat([prompt, negative_prompt[key]], dim=0)
+
+         gen_pooling = self.get_gen_pooling()
+         n_query = self.get_n_query()
+         num_img, _, c = prompt.shape
+         if 'pool2d' in gen_pooling and has_text and 'early' not in gen_pooling:
+             stride = int(gen_pooling.split('_')[1])
+             sqrt_n = int(n_query**0.5)
+             prompt = prompt.permute(0, 2, 1).reshape(num_img, -1, sqrt_n, sqrt_n)
+             prompt = F.avg_pool2d(prompt, kernel_size=(stride, stride), stride=stride)
+             prompt = prompt.reshape(num_img, c, -1).permute(0, 2, 1)
+         return prompt
+
+     def prepare_inputs_for_generation(self, input_ids, past_key_values=None,
+                                       inputs_embeds=None, **kwargs):
+         images = kwargs.pop("images", None)
+         image_sizes = kwargs.pop("image_sizes", None)
+         inputs = super().prepare_inputs_for_generation(
+             input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
+         )
+         if images is not None:
+             inputs['images'] = images
+         if image_sizes is not None:
+             inputs['image_sizes'] = image_sizes
+         return inputs
+
+
+ AutoConfig.register("blip3o_qwen", blip3oQwenConfig)
+ AutoModelForCausalLM.register(blip3oQwenConfig, blip3oQwenForInferenceLM)
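Note: sample_images applies classifier-free guidance by stacking a zeroed (unconditional) copy of the conditioning with the real one and combining the two DiT predictions. The combine step, isolated with toy tensors:

    import torch

    guidance_scale = 3.0
    noise_pred = torch.randn(4, 4, 32, 32)   # [uncond | cond] stacked on dim 0
    uncond, cond = noise_pred.chunk(2)
    guided = uncond + guidance_scale * (cond - uncond)
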
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/lumina_nextdit2d.py ADDED
@@ -0,0 +1,365 @@
+ # Copyright 2024 Alpha-VLLM Authors and The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import Any, Dict, Optional
+
+ import torch
+ import torch.nn as nn
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
+ from diffusers.models.attention import LuminaFeedForward
+ from diffusers.models.attention_processor import Attention, LuminaAttnProcessor2_0
+ from diffusers.models.embeddings import LuminaCombinedTimestepCaptionEmbedding, LuminaPatchEmbed, PixArtAlphaTextProjection
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
+ from diffusers.models.modeling_utils import ModelMixin
+ from diffusers.models.normalization import LuminaLayerNormContinuous, LuminaRMSNormZero, RMSNorm
+ from diffusers.utils import is_torch_version, logging
+
+ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+ class LuminaNextDiTBlock(nn.Module):
+     """
+     A LuminaNextDiTBlock for LuminaNextDiT2DModel.
+
+     Parameters:
+         dim (`int`): Embedding dimension of the input features.
+         num_attention_heads (`int`): Number of attention heads.
+         num_kv_heads (`int`):
+             Number of attention heads in key and value features (if using GQA), or set to None for the same as query.
+         multiple_of (`int`): The value the feed-forward hidden dimension is rounded up to a multiple of.
+         ffn_dim_multiplier (`float`): The multiplier factor for the feed-forward layer dimension.
+         norm_eps (`float`): The eps for the norm layers.
+         qk_norm (`bool`): Whether to normalize queries and keys.
+         cross_attention_dim (`int`): Cross-attention embedding dimension of the input text prompt hidden_states.
+         norm_elementwise_affine (`bool`, *optional*, defaults to True): Whether norm layers have learnable affine parameters.
+     """
+
+     def __init__(
+         self,
+         dim: int,
+         num_attention_heads: int,
+         num_kv_heads: int,
+         multiple_of: int,
+         ffn_dim_multiplier: float,
+         norm_eps: float,
+         qk_norm: bool,
+         cross_attention_dim: int,
+         norm_elementwise_affine: bool = True,
+     ) -> None:
+         super().__init__()
+         self.head_dim = dim // num_attention_heads
+
+         self.gate = nn.Parameter(torch.zeros([num_attention_heads]))
+
+         # Self-attention
+         self.attn1 = Attention(
+             query_dim=dim,
+             cross_attention_dim=None,
+             dim_head=dim // num_attention_heads,
+             qk_norm="layer_norm_across_heads" if qk_norm else None,
+             heads=num_attention_heads,
+             kv_heads=num_kv_heads,
+             eps=1e-5,
+             bias=False,
+             out_bias=False,
+             processor=LuminaAttnProcessor2_0(),
+         )
+         self.attn1.to_out = nn.Identity()
+
+         # Cross-attention
+         self.attn2 = Attention(
+             query_dim=dim,
+             cross_attention_dim=cross_attention_dim,
+             dim_head=dim // num_attention_heads,
+             qk_norm="layer_norm_across_heads" if qk_norm else None,
+             heads=num_attention_heads,
+             kv_heads=num_kv_heads,
+             eps=1e-5,
+             bias=False,
+             out_bias=False,
+             processor=LuminaAttnProcessor2_0(),
+         )
+
+         self.feed_forward = LuminaFeedForward(
+             dim=dim,
+             inner_dim=4 * dim,
+             multiple_of=multiple_of,
+             ffn_dim_multiplier=ffn_dim_multiplier,
+         )
+
+         self.norm1 = LuminaRMSNormZero(
+             embedding_dim=dim,
+             norm_eps=norm_eps,
+             norm_elementwise_affine=norm_elementwise_affine,
+         )
+         self.ffn_norm1 = RMSNorm(dim, eps=norm_eps, elementwise_affine=norm_elementwise_affine)
+
+         self.norm2 = RMSNorm(dim, eps=norm_eps, elementwise_affine=norm_elementwise_affine)
+         self.ffn_norm2 = RMSNorm(dim, eps=norm_eps, elementwise_affine=norm_elementwise_affine)
+
+         self.norm1_context = RMSNorm(cross_attention_dim, eps=norm_eps, elementwise_affine=norm_elementwise_affine)
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: torch.Tensor,
+         image_rotary_emb: torch.Tensor,
+         encoder_hidden_states: torch.Tensor,
+         encoder_mask: torch.Tensor,
+         temb: torch.Tensor,
+         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+     ):
+         """
+         Perform a forward pass through the LuminaNextDiTBlock.
+
+         Parameters:
+             hidden_states (`torch.Tensor`): The input hidden_states for the LuminaNextDiTBlock.
+             attention_mask (`torch.Tensor`): The attention mask corresponding to hidden_states.
+             image_rotary_emb (`torch.Tensor`): Precomputed cosine and sine frequencies.
+             encoder_hidden_states (`torch.Tensor`): Hidden states of the text prompt, as produced by the Gemma encoder.
+             encoder_mask (`torch.Tensor`): The attention mask of the text prompt hidden states.
+             temb (`torch.Tensor`): Timestep embedding combined with the text prompt embedding.
+             cross_attention_kwargs (`Dict[str, Any]`): kwargs for cross-attention.
+         """
+         if cross_attention_kwargs is None:
+             # guard so the `**cross_attention_kwargs` unpacking below does not fail
+             cross_attention_kwargs = {}
+
+         residual = hidden_states
+
+         # Self-attention
+         norm_hidden_states, gate_msa, scale_mlp, gate_mlp = self.norm1(hidden_states, temb)
+         self_attn_output = self.attn1(
+             hidden_states=norm_hidden_states,
+             encoder_hidden_states=norm_hidden_states,
+             attention_mask=attention_mask,
+             query_rotary_emb=image_rotary_emb,
+             key_rotary_emb=image_rotary_emb,
+             **cross_attention_kwargs,
+         )
+
+         # Cross-attention
+         norm_encoder_hidden_states = self.norm1_context(encoder_hidden_states)
+         cross_attn_output = self.attn2(
+             hidden_states=norm_hidden_states,
+             encoder_hidden_states=norm_encoder_hidden_states,
+             attention_mask=encoder_mask,
+             query_rotary_emb=image_rotary_emb,
+             key_rotary_emb=None,
+             **cross_attention_kwargs,
+         )
+         cross_attn_output = cross_attn_output * self.gate.tanh().view(1, 1, -1, 1)
+         mixed_attn_output = self_attn_output + cross_attn_output
+         mixed_attn_output = mixed_attn_output.flatten(-2)
+         # linear proj
+         hidden_states = self.attn2.to_out[0](mixed_attn_output)
+
+         hidden_states = residual + gate_msa.unsqueeze(1).tanh() * self.norm2(hidden_states)
+
+         mlp_output = self.feed_forward(self.ffn_norm1(hidden_states) * (1 + scale_mlp.unsqueeze(1)))
+
+         hidden_states = hidden_states + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2(mlp_output)
+
+         return hidden_states
+
+
+ class LuminaNextDiT2DModel(ModelMixin, ConfigMixin):
+     """
+     LuminaNextDiT: Diffusion model with a Transformer backbone.
+
+     Inherits from ModelMixin and ConfigMixin to be compatible with diffusers samplers such as StableDiffusionPipeline.
+
+     Parameters:
+         sample_size (`int`): The width of the latent images. This is fixed during training since
+             it is used to learn a number of position embeddings.
+         patch_size (`int`, *optional*, defaults to 2):
+             The size of each patch in the image. This parameter defines the resolution of patches fed into the model.
+         in_channels (`int`, *optional*, defaults to 4):
+             The number of input channels for the model. Typically, this matches the number of channels in the input
+             images.
+         hidden_size (`int`, *optional*, defaults to 2304):
+             The dimensionality of the hidden layers in the model. This parameter determines the width of the model's
+             hidden representations.
+         num_layers (`int`, *optional*, defaults to 32):
+             The number of layers in the model. This defines the depth of the neural network.
+         num_attention_heads (`int`, *optional*, defaults to 32):
+             The number of attention heads in each attention layer. This parameter specifies how many separate attention
+             mechanisms are used.
+         num_kv_heads (`int`, *optional*, defaults to 8):
+             The number of key-value heads in the attention mechanism, if different from the number of attention heads.
+             If None, it defaults to num_attention_heads.
+         multiple_of (`int`, *optional*, defaults to 256):
+             A factor that the hidden size should be a multiple of. This can help optimize certain hardware
+             configurations.
+         ffn_dim_multiplier (`float`, *optional*):
+             A multiplier for the dimensionality of the feed-forward network. If None, it uses a default value based on
+             the model configuration.
+         norm_eps (`float`, *optional*, defaults to 1e-5):
+             A small value added to the denominator for numerical stability in normalization layers.
+         learn_sigma (`bool`, *optional*, defaults to True):
+             Whether the model should learn the sigma parameter, which might be related to uncertainty or variance in
+             predictions.
+         qk_norm (`bool`, *optional*, defaults to True):
+             Indicates if the queries and keys in the attention mechanism should be normalized.
+         cross_attention_dim (`int`, *optional*, defaults to 2048):
+             The dimensionality of the text embeddings. This parameter defines the size of the text representations used
+             in the model.
+         scaling_factor (`float`, *optional*, defaults to 1.0):
+             A scaling factor applied to certain parameters or layers in the model. This can be used for adjusting the
+             overall scale of the model's operations.
+     """
+
+     _supports_gradient_checkpointing = True
+     _no_split_modules = ["LuminaNextDiTBlock"]
+
+     @register_to_config
+     def __init__(
+         self,
+         sample_size: int = 128,
+         patch_size: Optional[int] = 2,
+         in_channels: Optional[int] = 4,
+         hidden_size: Optional[int] = 2304,
+         num_layers: Optional[int] = 32,
+         num_attention_heads: Optional[int] = 32,
+         num_kv_heads: Optional[int] = None,
+         multiple_of: Optional[int] = 256,
+         ffn_dim_multiplier: Optional[float] = None,
+         norm_eps: Optional[float] = 1e-5,
+         learn_sigma: Optional[bool] = True,
+         qk_norm: Optional[bool] = True,
+         cross_attention_dim: Optional[int] = 2048,
+         scaling_factor: Optional[float] = 1.0,
+     ) -> None:
+         super().__init__()
+         self.sample_size = sample_size
+         self.patch_size = patch_size
+         self.in_channels = in_channels
+         self.out_channels = in_channels * 2 if learn_sigma else in_channels
+         self.hidden_size = hidden_size
+         self.num_attention_heads = num_attention_heads
+         self.head_dim = hidden_size // num_attention_heads
+         self.scaling_factor = scaling_factor
+         self.gradient_checkpointing = False
+
+         self.caption_projection = PixArtAlphaTextProjection(in_features=cross_attention_dim, hidden_size=hidden_size)
+         self.patch_embedder = LuminaPatchEmbed(patch_size=patch_size, in_channels=in_channels, embed_dim=hidden_size, bias=True)
+
+         self.time_caption_embed = LuminaCombinedTimestepCaptionEmbedding(hidden_size=min(hidden_size, 1024), cross_attention_dim=hidden_size)
+
+         self.layers = nn.ModuleList(
+             [
+                 LuminaNextDiTBlock(
+                     hidden_size,
+                     num_attention_heads,
+                     num_kv_heads,
+                     multiple_of,
+                     ffn_dim_multiplier,
+                     norm_eps,
+                     qk_norm,
+                     hidden_size,
+                 )
+                 for _ in range(num_layers)
+             ]
+         )
+         self.norm_out = LuminaLayerNormContinuous(
+             embedding_dim=hidden_size,
+             conditioning_embedding_dim=min(hidden_size, 1024),
+             elementwise_affine=False,
+             eps=1e-6,
+             bias=True,
+             out_dim=patch_size * patch_size * self.out_channels,
+         )
+
+         assert (hidden_size // num_attention_heads) % 4 == 0, "2d rope needs head dim to be divisible by 4"
+
+     def _set_gradient_checkpointing(self, module, value=False):
+         if hasattr(module, "gradient_checkpointing"):
+             module.gradient_checkpointing = value
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         timestep: torch.Tensor,
+         encoder_hidden_states: torch.Tensor,
+         encoder_mask: torch.Tensor,
+         image_rotary_emb: torch.Tensor,
+         cross_attention_kwargs: Dict[str, Any] = None,
+         return_dict=True,
+     ) -> torch.Tensor:
+         """
+         Forward pass of LuminaNextDiT.
+
+         Parameters:
+             hidden_states (torch.Tensor): Input tensor of shape (N, C, H, W).
+             timestep (torch.Tensor): Tensor of diffusion timesteps of shape (N,).
+             encoder_hidden_states (torch.Tensor): Tensor of caption features of shape (N, D).
+             encoder_mask (torch.Tensor): Tensor of caption masks of shape (N, L).
+         """
+         hidden_states, mask, img_size, image_rotary_emb = self.patch_embedder(hidden_states, image_rotary_emb)
+         image_rotary_emb = image_rotary_emb.to(hidden_states.device)
+         encoder_hidden_states = self.caption_projection(encoder_hidden_states)
+         temb = self.time_caption_embed(timestep, encoder_hidden_states, encoder_mask)
+
+         encoder_mask = encoder_mask.bool()
+
+         for layer in self.layers:
+             if self.training and self.gradient_checkpointing:
+
+                 def create_custom_forward(module, return_dict=None):
+                     def custom_forward(*inputs):
+                         if return_dict is not None:
+                             return module(*inputs, return_dict=return_dict)
+                         else:
+                             return module(*inputs)
+
+                     return custom_forward
+
+                 ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                 hidden_states = torch.utils.checkpoint.checkpoint(
+                     create_custom_forward(layer),
+                     hidden_states,
+                     mask,
+                     image_rotary_emb,
+                     encoder_hidden_states,
+                     encoder_mask,
+                     temb,
+                     cross_attention_kwargs,
+                     **ckpt_kwargs,
+                 )
+             else:
+                 hidden_states = layer(
+                     hidden_states,
+                     mask,
+                     image_rotary_emb,
+                     encoder_hidden_states,
+                     encoder_mask,
+                     temb=temb,
+                     cross_attention_kwargs=cross_attention_kwargs,
+                 )
+
+         hidden_states = self.norm_out(hidden_states, temb)
+
+         # unpatchify
+         height_tokens = width_tokens = self.patch_size
+         height, width = img_size[0]
+         batch_size = hidden_states.size(0)
+         sequence_length = (height // height_tokens) * (width // width_tokens)
+         hidden_states = hidden_states[:, :sequence_length].view(
+             batch_size, height // height_tokens, width // width_tokens, height_tokens, width_tokens, self.out_channels
+         )
+         output = hidden_states.permute(0, 5, 1, 3, 2, 4).flatten(4, 5).flatten(2, 3)
+
+         if not return_dict:
+             return (output,)
+
+         return Transformer2DModelOutput(sample=output)
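Note: the unpatchify step at the end of forward() maps per-patch token features back to an image grid. A standalone sketch of the same view/permute/flatten sequence with toy sizes (all variable names here are illustrative):

    import torch

    B, p, C, H, W = 1, 2, 8, 16, 16
    seq_len = (H // p) * (W // p)
    tokens = torch.randn(B, seq_len, p * p * C)
    x = tokens.view(B, H // p, W // p, p, p, C)
    image = x.permute(0, 5, 1, 3, 2, 4).flatten(4, 5).flatten(2, 3)
    assert image.shape == (B, C, H, W)
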
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/make_delta.py ADDED
@@ -0,0 +1,48 @@
+ import argparse
+
+ import torch
+ from tqdm import tqdm
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from blip3o.model.utils import auto_upgrade
+
+
+ def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id):
+     print("Loading base model")
+     base = AutoModelForCausalLM.from_pretrained(
+         base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
+
+     print("Loading target model")
+     auto_upgrade(target_model_path)
+     target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
+
+     print("Calculating delta")
+     for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"):
+         if name not in base.state_dict():
+             assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model'
+             continue
+         if param.data.shape == base.state_dict()[name].shape:
+             param.data -= base.state_dict()[name]
+         else:
+             assert name in ['model.embed_tokens.weight', 'lm_head.weight'], f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}'
+             bparam = base.state_dict()[name]
+             param.data[:bparam.shape[0], :bparam.shape[1]] -= bparam
+
+     print("Saving delta")
+     if hub_repo_id:
+         kwargs = {"push_to_hub": True, "repo_id": hub_repo_id}
+     else:
+         kwargs = {}
+     target.save_pretrained(delta_path, **kwargs)
+     target_tokenizer = AutoTokenizer.from_pretrained(target_model_path)
+     target_tokenizer.save_pretrained(delta_path, **kwargs)
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--base-model-path", type=str, required=True)
+     parser.add_argument("--target-model-path", type=str, required=True)
+     parser.add_argument("--delta-path", type=str, required=True)
+     parser.add_argument("--hub-repo-id", type=str, default=None)
+     args = parser.parse_args()
+
+     make_delta(args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id)
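Note: the delta stored here is just target minus base, elementwise (with zero-padded handling for resized embeddings); apply_delta.py, also added in this commit, reverses it. The core invariant, sketched with toy tensors:

    import torch

    base = torch.randn(4, 4)
    target = torch.randn(4, 4)
    delta = target - base             # what make_delta writes out
    assert torch.allclose(base + delta, target)
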
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/__pycache__/builder.cpython-311.pyc ADDED
Binary file (3.5 kB).
 
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/__pycache__/clip_encoder.cpython-311.pyc ADDED
Binary file (11.9 kB).
 
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/__pycache__/imagebind.cpython-311.pyc ADDED
Binary file (4.86 kB).
 
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/__pycache__/open_clip_encoder.cpython-311.pyc ADDED
Binary file (11.2 kB).
 
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/__pycache__/siglip_encoder.cpython-311.pyc ADDED
Binary file (36.9 kB).
 
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/builder.py ADDED
@@ -0,0 +1,55 @@
+ import os
+ from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2
+ from .imagebind import ImageBindWrapper
+ from .open_clip_encoder import OpenCLIPVisionTower
+ from .siglip_encoder import SigLipVisionTower
+
+ from .eva_clip.eva_clip_encoder import EvaClipVisionTower
+ from .dev_eva_clip.eva_vit import EvaViTWrapper
+
+ from blip3o.model.nextdit_crossattn import NextDiTCrossAttnConfig, NextDiTCrossAttn
+ from diffusers.models import AutoencoderKL
+ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
+ from transformers import AutoModel, AutoProcessor, SiglipVisionModel, AutoConfig
+
+
+ def build_vision_tower(vision_tower_cfg, **kwargs):
+     vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
+     is_absolute_path_exists = os.path.exists(vision_tower)
+     use_s2 = getattr(vision_tower_cfg, 's2', False)
+     if "siglip2" in vision_tower:
+         return SiglipVisionModel.from_pretrained(vision_tower, attn_implementation="sdpa")
+     raise ValueError(f'Unknown vision tower: {vision_tower}')
+
+
+ def build_gen_vision_tower(vision_tower_cfg, **kwargs):
+     vision_tower = getattr(vision_tower_cfg, 'gen_vision_tower')
+     is_absolute_path_exists = os.path.exists(vision_tower)
+     use_s2 = getattr(vision_tower_cfg, 's2', False)
+     if "siglip2" in vision_tower:
+         return SiglipVisionModel.from_pretrained(vision_tower, attn_implementation="sdpa")
+     if "eva" in vision_tower:
+         return EvaClipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
+     if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower:
+         if use_s2:
+             return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs)
+         else:
+             return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
+
+     raise ValueError(f'Unknown vision tower: {vision_tower}')
+
+
+ def build_dit(vision_tower_cfg, **kwargs):
+     if not hasattr(vision_tower_cfg, "hidden_size"):
+         vision_tower_cfg.hidden_size = AutoConfig.from_pretrained(vision_tower_cfg.model_name_or_path).hidden_size
+
+     dit = NextDiTCrossAttn(NextDiTCrossAttnConfig(latent_embedding_size=vision_tower_cfg.hidden_size))
+     noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained("Alpha-VLLM/Lumina-Next-SFT-diffusers", subfolder="scheduler")
+     return dit, noise_scheduler
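Note: a minimal sketch of calling build_vision_tower. The config object is a stand-in (anything whose mm_vision_tower attribute names a siglip2 checkpoint works), and the checkpoint name below is an assumption for illustration, not one pinned by this repo:

    from types import SimpleNamespace
    from blip3o.model.multimodal_encoder.builder import build_vision_tower

    cfg = SimpleNamespace(mm_vision_tower="google/siglip2-base-patch16-224")  # hypothetical checkpoint
    vision_tower = build_vision_tower(cfg)  # returns a SiglipVisionModel
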
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/clip_encoder.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+import torch
+import torch.nn as nn
+from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
+
+try:
+    from s2wrapper import forward as multiscale_forward
+except ImportError:
+    # s2wrapper is only needed by CLIPVisionTowerS2; the plain tower works without it.
+    multiscale_forward = None
+
+
+class CLIPVisionTower(nn.Module):
+    def __init__(self, vision_tower, args, delay_load=False):
+        super().__init__()
+
+        self.is_loaded = False
+
+        self.vision_tower_name = vision_tower
+        self.select_layer = args.mm_vision_select_layer
+        self.select_feature = getattr(args, "mm_vision_select_feature", "patch")
+
+        if not delay_load:
+            print(f"Loading vision tower: {vision_tower}")
+            self.load_model()
+        elif getattr(args, "unfreeze_mm_vision_tower", False):
+            # TODO: better detector is needed.
+            print("The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.")
+            self.load_model()
+        elif hasattr(args, "mm_tunable_parts") and "mm_vision_tower" in args.mm_tunable_parts:
+            print("The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.")
+            self.load_model()
+        else:
+            # Delay-loaded: keep only the config so properties like hidden_size still work.
+            self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)
+
+    def load_model(self, device_map=None):
+        if self.is_loaded:
+            print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name))
+            return
+
+        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
+        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
+        self.vision_tower.requires_grad_(False)
+
+        self.is_loaded = True
+
+    def feature_select(self, image_forward_outs):
+        select_feature_type = self.select_feature
+
+        if self.select_feature in ["slicefour_patch", "slicefour_cls_patch"]:
+            # "slicefour_*": concatenate four evenly spaced hidden layers along the channel dim.
+            select_every_k_layer = len(image_forward_outs.hidden_states) // 4
+            image_features = torch.cat([image_forward_outs.hidden_states[i] for i in range(select_every_k_layer + self.select_layer, len(image_forward_outs.hidden_states), select_every_k_layer)], dim=-1)
+            select_feature_type = select_feature_type.replace("slicefour_", "")
+        elif self.select_feature in ["slice_m25811_f6_patch", "slice_m25811_f6_cls_patch"]:
+            # "slice_m25811_f6_*": concatenate five fixed layers (hence hidden_size * 5 below).
+            select_layers = [-2, -5, -8, -11, 6]
+            image_features = torch.cat([image_forward_outs.hidden_states[i] for i in select_layers], dim=-1)
+            select_feature_type = select_feature_type.replace("slice_m25811_f6_", "")
+        else:
+            image_features = image_forward_outs.hidden_states[self.select_layer]
+
+        if select_feature_type == "patch":
+            image_features = image_features[:, 1:]
+        elif select_feature_type == "cls_patch":
+            image_features = image_features
+        else:
+            raise ValueError(f"Unexpected select feature: {select_feature_type}")
+        return image_features
+
+    def forward(self, images):
+        if isinstance(images, list):
+            image_features = []
+            for image in images:
+                image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
+                image_feature = self.feature_select(image_forward_out).to(image.dtype)
+                image_features.append(image_feature)
+        else:
+            image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
+            image_features = self.feature_select(image_forward_outs).to(images.dtype)
+
+        return image_features
+
+    @property
+    def dummy_feature(self):
+        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
+
+    @property
+    def dtype(self):
+        return self.vision_tower.dtype
+
+    @property
+    def device(self):
+        return self.vision_tower.device
+
+    @property
+    def config(self):
+        if self.is_loaded:
+            return self.vision_tower.config
+        else:
+            return self.cfg_only
+
+    @property
+    def hidden_size(self):
+        _hidden_size = self.config.hidden_size
+        if "slicefour" in self.select_feature:
+            _hidden_size *= 4
+        if "slice_m25811_f6" in self.select_feature:
+            _hidden_size *= 5
+        return _hidden_size
+
+    @property
+    def num_patches_per_side(self):
+        return self.config.image_size // self.config.patch_size
+
+    @property
+    def num_patches(self):
+        _num_patches = (self.config.image_size // self.config.patch_size) ** 2
+        if "cls_patch" in self.select_feature:
+            _num_patches += 1
+        return _num_patches
+
+    @property
+    def image_size(self):
+        return self.config.image_size
+
+
+class CLIPVisionTowerS2(CLIPVisionTower):
+    def __init__(self, vision_tower, args, delay_load=False):
+        self.s2_scales = getattr(args, "s2_scales", "336,672,1008")
+        self.s2_scales = list(map(int, self.s2_scales.split(",")))
+        self.s2_scales.sort()
+        self.s2_split_size = self.s2_scales[0]
+        self.s2_image_size = self.s2_scales[-1]
+
+        super().__init__(vision_tower, args, delay_load)
+
+        # change resize/crop size in preprocessing to the largest image size in s2_scales
+        if not delay_load or getattr(args, "unfreeze_mm_vision_tower", False):
+            self.image_processor.size["shortest_edge"] = self.s2_image_size
+            self.image_processor.crop_size["height"] = self.image_processor.crop_size["width"] = self.s2_image_size
+
+    def load_model(self, device_map=None):
+        if self.is_loaded:
+            print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name))
+            return
+
+        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
+        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
+        self.vision_tower.requires_grad_(False)
+
+        self.image_processor.size["shortest_edge"] = self.s2_image_size
+        self.image_processor.crop_size["height"] = self.image_processor.crop_size["width"] = self.s2_image_size
+
+        self.is_loaded = True
+
+    def forward_feature(self, images):
+        image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
+        image_features = self.feature_select(image_forward_outs).to(images.dtype)
+        return image_features
+
+    def forward(self, images):
+        # Multi-scale forward from s2wrapper: runs forward_feature at each scale and merges.
+        if isinstance(images, list):
+            image_features = []
+            for image in images:
+                image_feature = multiscale_forward(self.forward_feature, image.unsqueeze(0), img_sizes=self.s2_scales, max_split_size=self.s2_split_size, split_forward=True)
+                image_features.append(image_feature)
+        else:
+            image_features = multiscale_forward(self.forward_feature, images, img_sizes=self.s2_scales, max_split_size=self.s2_split_size, split_forward=True)
+
+        return image_features
+
+    @property
+    def hidden_size(self):
+        return self.config.hidden_size * len(self.s2_scales)
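For orientation, a small usage sketch of the plain tower. The argument values are hypothetical; `mm_vision_select_layer=-2` with `"patch"` selection is the common LLaVA-style default, and the checkpoint download happens on construction.

    from types import SimpleNamespace
    import torch

    args = SimpleNamespace(mm_vision_select_layer=-2, mm_vision_select_feature="patch")
    tower = CLIPVisionTower("openai/clip-vit-large-patch14-336", args=args)

    pixels = torch.randn(2, 3, tower.image_size, tower.image_size).to(tower.device, tower.dtype)
    with torch.no_grad():
        feats = tower(pixels)
    # feats: (2, num_patches, hidden_size) -- the CLS token is dropped by "patch" selection
    print(feats.shape, tower.num_patches, tower.hidden_size)  # e.g. (2, 576, 1024), 576, 1024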
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/__pycache__/eva_vit.cpython-311.pyc ADDED
Binary file (8.76 kB).
 
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__init__.py ADDED
@@ -0,0 +1,9 @@
+from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
+from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer
+from .factory import list_models, add_model_config, get_model_config, load_checkpoint
+from .loss import ClipLoss
+from .model import CLIP, CustomCLIP, CLIPTextCfg, CLIPVisionCfg, convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype
+from .openai import load_openai_model, list_openai_models
+from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained
+from .tokenizer import SimpleTokenizer, tokenize
+from .transform import image_transform
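These re-exports mirror the open_clip factory surface. A hedged sketch of how the vendored package might be used directly, assuming the open_clip-style signature `create_model_and_transforms(model_name, pretrained=...)` returning `(model, preprocess_train, preprocess_val)`; the model name and checkpoint path below are placeholders:

    from blip3o.model.multimodal_encoder.dev_eva_clip.eva_clip import (
        create_model_and_transforms, get_tokenizer,
    )

    # Placeholder names; real EVA-CLIP config names ship with this package.
    model, _, preprocess = create_model_and_transforms(
        "EVA02-CLIP-L-14-336", pretrained="/path/to/eva02_clip.pt",
    )
    tokenizer = get_tokenizer("EVA02-CLIP-L-14-336")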
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.67 kB).
 
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/constants.cpython-311.pyc ADDED
Binary file (316 Bytes).
 
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/eva_vit_model.cpython-311.pyc ADDED
Binary file (34.1 kB).
 
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/factory.cpython-311.pyc ADDED
Binary file (27.8 kB).
 
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/hf_configs.cpython-311.pyc ADDED
Binary file (830 Bytes).
 
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/hf_model.cpython-311.pyc ADDED
Binary file (13.7 kB).
 
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/loss.cpython-311.pyc ADDED
Binary file (6.78 kB).
 
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/model.cpython-311.pyc ADDED
Binary file (25.3 kB).
 
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/modified_resnet.cpython-311.pyc ADDED
Binary file (13.3 kB).