Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +494 -0
- UMM/BLIP3o-Qwen3-Siglip2/README.md +31 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/.DS_Store +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/__init__.py +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/__pycache__/__init__.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/__pycache__/constants.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/__pycache__/conversation.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/__pycache__/mm_utils.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/__pycache__/utils.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/constants.py +26 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/conversation.py +476 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/mm_utils.py +247 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/__init__.py +4 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/__pycache__/__init__.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/__pycache__/blip3o_arch.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/__pycache__/builder.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/__pycache__/llava_arch.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/__pycache__/lumina_nextdit2d.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/__pycache__/nextdit_crossattn.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/apply_delta.py +48 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/blip3o_arch.py +371 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/builder.py +103 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/consolidate.py +25 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/language_model/__pycache__/blip3o_llama.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/language_model/__pycache__/blip3o_qwen.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/language_model/__pycache__/blip3o_qwen_inference.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/language_model/__pycache__/llava_llama.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/language_model/__pycache__/llava_qwen.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/language_model/blip3o_qwen.py +421 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/language_model/blip3o_qwen_inference.py +418 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/lumina_nextdit2d.py +365 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/make_delta.py +48 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/__pycache__/builder.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/__pycache__/clip_encoder.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/__pycache__/imagebind.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/__pycache__/open_clip_encoder.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/__pycache__/siglip_encoder.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/builder.py +55 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/clip_encoder.py +172 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/__pycache__/eva_vit.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__init__.py +9 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/__init__.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/constants.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/eva_vit_model.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/factory.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/hf_configs.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/hf_model.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/loss.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/model.cpython-311.pyc +0 -0
- UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/modified_resnet.cpython-311.pyc +0 -0
.gitattributes
CHANGED
|
@@ -7408,3 +7408,497 @@ not_work/GenCT-ageencoder-casualatte-frozentext2-multimlp/logs/full_ct_2d_with_b
|
|
| 7408 |
not_work/GenCT-ageencoder-casualatte-frozentext2-multimlp/logs/full_ct_2d_with_body_mask/inference/BDMAP_00009151_sample_8.png filter=lfs diff=lfs merge=lfs -text
|
| 7409 |
not_work/GenCT-ageencoder-casualatte-frozentext2-multimlp/logs/full_ct_2d_with_body_mask/inference/BDMAP_00009151_sample_9.png filter=lfs diff=lfs merge=lfs -text
|
| 7410 |
not_work/GenCT-ageencoder-casualatte-frozentext2-multimlp/misc/overview.jpg filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7408 |
not_work/GenCT-ageencoder-casualatte-frozentext2-multimlp/logs/full_ct_2d_with_body_mask/inference/BDMAP_00009151_sample_8.png filter=lfs diff=lfs merge=lfs -text
|
| 7409 |
not_work/GenCT-ageencoder-casualatte-frozentext2-multimlp/logs/full_ct_2d_with_body_mask/inference/BDMAP_00009151_sample_9.png filter=lfs diff=lfs merge=lfs -text
|
| 7410 |
not_work/GenCT-ageencoder-casualatte-frozentext2-multimlp/misc/overview.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7411 |
+
UMM/BLIP3o-Qwen3-Siglip2/eval/lmms-eval/lmms_eval/tasks/mmmu/arial.ttf filter=lfs diff=lfs merge=lfs -text
|
| 7412 |
+
UMM/BLIP3o-Qwen3-Siglip2/eval/lmms-eval/tools/live_bench/live_bench/data_generator/example/example_website.png filter=lfs diff=lfs merge=lfs -text
|
| 7413 |
+
UMM/BLIP3o-Qwen3-Siglip2/figure/arch.png filter=lfs diff=lfs merge=lfs -text
|
| 7414 |
+
UMM/BLIP3o-Qwen3-Siglip2/figure/image.png filter=lfs diff=lfs merge=lfs -text
|
| 7415 |
+
UMM/BLIP3o-Qwen3-Siglip2/figure/wechat_1.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7416 |
+
UMM/BLIP3o-Qwen3-Siglip2/figure/wechat_2.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7417 |
+
UMM/BLIP3o-Qwen3-Siglip2/gradio/animal-compare.png filter=lfs diff=lfs merge=lfs -text
|
| 7418 |
+
UMM/BLIP3o-main/eval/lmms-eval/lmms_eval/tasks/mmmu/arial.ttf filter=lfs diff=lfs merge=lfs -text
|
| 7419 |
+
UMM/BLIP3o-main/eval/lmms-eval/tools/live_bench/live_bench/data_generator/example/example_website.png filter=lfs diff=lfs merge=lfs -text
|
| 7420 |
+
UMM/BLIP3o-main/figure/arch.png filter=lfs diff=lfs merge=lfs -text
|
| 7421 |
+
UMM/BLIP3o-main/figure/image.png filter=lfs diff=lfs merge=lfs -text
|
| 7422 |
+
UMM/BLIP3o-main/figure/wechat_2.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7423 |
+
UMM/BLIP3o-main/gradio/animal-compare.png filter=lfs diff=lfs merge=lfs -text
|
| 7424 |
+
UMM/Bagel-Med/assets/arch.png filter=lfs diff=lfs merge=lfs -text
|
| 7425 |
+
UMM/Bagel-Med/assets/emerging_curves.png filter=lfs diff=lfs merge=lfs -text
|
| 7426 |
+
UMM/Bagel-Med/assets/teaser.webp filter=lfs diff=lfs merge=lfs -text
|
| 7427 |
+
UMM/Bagel-Med/inference.ipynb filter=lfs diff=lfs merge=lfs -text
|
| 7428 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/000000013318.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7429 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/000000069500.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7430 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/000000126457.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7431 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/000000128172.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7432 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/000000162435.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7433 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/000000269996.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7434 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/000000409897.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7435 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/000000429873.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7436 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/00548dfc8ec76f5d.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7437 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/008447.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7438 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/008548.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7439 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/010706.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7440 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/016721.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7441 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/024486.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7442 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/027170.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7443 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/030168.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7444 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/032672906be2e4c9.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7445 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/033215.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7446 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/036195.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7447 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/038257.png filter=lfs diff=lfs merge=lfs -text
|
| 7448 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/038468.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7449 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/043701.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7450 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/051169.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7451 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/052849.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7452 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/057680.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7453 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/060165.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7454 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/062009.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7455 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/062571.png filter=lfs diff=lfs merge=lfs -text
|
| 7456 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/063b239d-0ce7-4fdd-8984-17a823e92db8.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7457 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/064115.png filter=lfs diff=lfs merge=lfs -text
|
| 7458 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/065501.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7459 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/071939.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7460 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/071969.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7461 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/072068.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7462 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/076318.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7463 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/076910.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7464 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/077021.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7465 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/083379.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7466 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/083752.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7467 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/086628.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7468 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/089255.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7469 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/090048.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7470 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/092505.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7471 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/094888.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7472 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/096894.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7473 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/099141.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7474 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/0cd50fe4a0c2b85b.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7475 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/1-1277350-3.png filter=lfs diff=lfs merge=lfs -text
|
| 7476 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/1-15838081-3.png filter=lfs diff=lfs merge=lfs -text
|
| 7477 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/1-2082992-5.png filter=lfs diff=lfs merge=lfs -text
|
| 7478 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/1-25926120-7.png filter=lfs diff=lfs merge=lfs -text
|
| 7479 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/1-27435931-1.png filter=lfs diff=lfs merge=lfs -text
|
| 7480 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/1-29087004-3.png filter=lfs diff=lfs merge=lfs -text
|
| 7481 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/10011.jpeg.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7482 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/10751.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 7483 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/10820.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 7484 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/11534.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 7485 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/121350e0df26f3b6.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7486 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/146c68bec53d6555.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7487 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/18310.png filter=lfs diff=lfs merge=lfs -text
|
| 7488 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/18361a25529ab455.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7489 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/19745.png filter=lfs diff=lfs merge=lfs -text
|
| 7490 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/1e4f9f282cb0cd13.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7491 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-10166535-6.png filter=lfs diff=lfs merge=lfs -text
|
| 7492 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-10220225-2.png filter=lfs diff=lfs merge=lfs -text
|
| 7493 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-10809142-10.png filter=lfs diff=lfs merge=lfs -text
|
| 7494 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-10834877-1.png filter=lfs diff=lfs merge=lfs -text
|
| 7495 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-1132593-1.png filter=lfs diff=lfs merge=lfs -text
|
| 7496 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-11783766-1.png filter=lfs diff=lfs merge=lfs -text
|
| 7497 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-11960713-3.png filter=lfs diff=lfs merge=lfs -text
|
| 7498 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-12292738-1.png filter=lfs diff=lfs merge=lfs -text
|
| 7499 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-13193466-16.png filter=lfs diff=lfs merge=lfs -text
|
| 7500 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-14415913-1.png filter=lfs diff=lfs merge=lfs -text
|
| 7501 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-15198842-1.png filter=lfs diff=lfs merge=lfs -text
|
| 7502 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-15230272-3.png filter=lfs diff=lfs merge=lfs -text
|
| 7503 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-15524351-19.png filter=lfs diff=lfs merge=lfs -text
|
| 7504 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-16452451-2.png filter=lfs diff=lfs merge=lfs -text
|
| 7505 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-16514275-1.png filter=lfs diff=lfs merge=lfs -text
|
| 7506 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-16678064-7.png filter=lfs diff=lfs merge=lfs -text
|
| 7507 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-16678076-2.png filter=lfs diff=lfs merge=lfs -text
|
| 7508 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/2-18662026-1.png filter=lfs diff=lfs merge=lfs -text
|
| 7509 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/2018%2F04%2F12%2F5-things-im-looking-forward-to-at-the-cc-global-summit.png filter=lfs diff=lfs merge=lfs -text
|
| 7510 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/2d48a140b40ab164.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7511 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/30800.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 7512 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/30962.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 7513 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/31351.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 7514 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/32e250aa3b3ac720.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7515 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/35699.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 7516 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/35853.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 7517 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/368cd4700649c9cbd86d216dae633759_page0.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7518 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/394661ab74481234.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7519 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/39952.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 7520 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/3d951179-22dd-424a-8710-c37c23cdc53f.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7521 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/3eb46fdf7959dde6.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7522 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/3ff51c29c8072953.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7523 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/4166.png filter=lfs diff=lfs merge=lfs -text
|
| 7524 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/4276c80c3b7930c9.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7525 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/47118cff0c0fc78e.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7526 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/4a605f94-9c59-4f9b-b065-8149fd85ab04.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7527 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/4d7df8bc913ce40d.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7528 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/5b6b4973fa4dab6f.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7529 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/6753f457c6586afb.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7530 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/6961db28b3fb0fa6.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7531 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/6986ef13-a72d-4f1d-a56c-5aad081fc99f.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7532 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/7cf8aae04fe5d666.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7533 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/89f64c747cf016ec.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7534 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/8bb96c2c-44ae-44d5-a9b0-bd1dae6f606e.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7535 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/9062612221d6858f.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7536 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/944610512a34aa8c.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7537 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/98c23700d1fa72a2501413686351895d_page0.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7538 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/9957.png filter=lfs diff=lfs merge=lfs -text
|
| 7539 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/9cac76ee-185e-4f5d-b37a-f3dee758b6ad.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7540 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/Q-Instruct-DB_spaq_koniq_5001866078.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7541 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/__browse__family-and-whanau__getting-married__civil-unions.png filter=lfs diff=lfs merge=lfs -text
|
| 7542 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/__story__i-help-people-live-until-very-last-moment.png filter=lfs diff=lfs merge=lfs -text
|
| 7543 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/a04da161a9471f14.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7544 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/a16af064214aa8cd.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7545 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/afcf8b37411dc885.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7546 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/allava_laion_inst_102149.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7547 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/allava_laion_inst_103811.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7548 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/allava_laion_inst_11000.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7549 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/allava_laion_inst_110113.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7550 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/allava_laion_inst_114688.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7551 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/allava_laion_inst_129672.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7552 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/allava_laion_inst_140217.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7553 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/allava_laion_inst_14538.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7554 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/allava_laion_inst_145456.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7555 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/allava_laion_inst_151486.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7556 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/allava_vflan_inst_17341.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7557 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/allava_vflan_inst_4060.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7558 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/aokvqa_00000078.png filter=lfs diff=lfs merge=lfs -text
|
| 7559 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/aokvqa_00001895.png filter=lfs diff=lfs merge=lfs -text
|
| 7560 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/aokvqa_00002763.png filter=lfs diff=lfs merge=lfs -text
|
| 7561 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/aokvqa_00003400.png filter=lfs diff=lfs merge=lfs -text
|
| 7562 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/aokvqa_00003581.png filter=lfs diff=lfs merge=lfs -text
|
| 7563 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/aokvqa_00005417.png filter=lfs diff=lfs merge=lfs -text
|
| 7564 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/aokvqa_00005631.png filter=lfs diff=lfs merge=lfs -text
|
| 7565 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/aokvqa_00007073.png filter=lfs diff=lfs merge=lfs -text
|
| 7566 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/aokvqa_00015635.png filter=lfs diff=lfs merge=lfs -text
|
| 7567 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/aokvqa_00016326.png filter=lfs diff=lfs merge=lfs -text
|
| 7568 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/aokvqa_00016496.png filter=lfs diff=lfs merge=lfs -text
|
| 7569 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/articles%2F10.3389%2Ffrym.2019.00143.png filter=lfs diff=lfs merge=lfs -text
|
| 7570 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/b7611a502ccb5e78.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7571 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/c57e7036951a271d.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7572 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/c7635ad85e7d41de.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7573 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/cf2bb1fe-8d74-4661-b47c-a0374aa6ada7.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7574 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/clevr_00000282.png filter=lfs diff=lfs merge=lfs -text
|
| 7575 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/clevr_00000673.png filter=lfs diff=lfs merge=lfs -text
|
| 7576 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/clevr_CLEVR_v1.0_images_train_CLEVR_train_001004.png filter=lfs diff=lfs merge=lfs -text
|
| 7577 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/clevr_CLEVR_v1.0_images_train_CLEVR_train_002089.png filter=lfs diff=lfs merge=lfs -text
|
| 7578 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/clevr_CLEVR_v1.0_images_train_CLEVR_train_003884.png filter=lfs diff=lfs merge=lfs -text
|
| 7579 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/clevr_CLEVR_v1.0_images_train_CLEVR_train_025922.png filter=lfs diff=lfs merge=lfs -text
|
| 7580 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/clevr_CLEVR_v1.0_images_train_CLEVR_train_044970.png filter=lfs diff=lfs merge=lfs -text
|
| 7581 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/clevr_CLEVR_v1.0_images_train_CLEVR_train_058337.png filter=lfs diff=lfs merge=lfs -text
|
| 7582 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/clevr_CLEVR_v1.0_images_train_CLEVR_train_061217.png filter=lfs diff=lfs merge=lfs -text
|
| 7583 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/coco_train2017_000000014990.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7584 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/csv_202-csv_31.png filter=lfs diff=lfs merge=lfs -text
|
| 7585 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/csv_203-csv_346.png filter=lfs diff=lfs merge=lfs -text
|
| 7586 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/csv_203-csv_687.png filter=lfs diff=lfs merge=lfs -text
|
| 7587 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/csv_204-csv_967.png filter=lfs diff=lfs merge=lfs -text
|
| 7588 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/d5c18ad80e53a3a2.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7589 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/d625b376a03157b9.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7590 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/d9d8a559-8e36-aaaa-2260-828171d65d66_page0.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7591 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/design2code_images_12.png filter=lfs diff=lfs merge=lfs -text
|
| 7592 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/e3223bf8a2246fd7.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7593 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/e46fd80ca8cffc2f6adcf05ea82fe717_page0.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7594 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/e8ee95010a24c276.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7595 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/eaa4fab593b3f51c.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7596 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/ed15edb180c89ed2.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7597 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/en__Mersing.png filter=lfs diff=lfs merge=lfs -text
|
| 7598 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/en__Northeastern_India.png filter=lfs diff=lfs merge=lfs -text
|
| 7599 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/f7e353339bb0138b.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7600 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/fd6def9b-5147-4f65-bf4b-5bb33280cfad.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7601 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/fhng0227_20.png filter=lfs diff=lfs merge=lfs -text
|
| 7602 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/geomverse_00000695.png filter=lfs diff=lfs merge=lfs -text
|
| 7603 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/geomverse_00003159.png filter=lfs diff=lfs merge=lfs -text
|
| 7604 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/geomverse_00004202.png filter=lfs diff=lfs merge=lfs -text
|
| 7605 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/geomverse_00008626.png filter=lfs diff=lfs merge=lfs -text
|
| 7606 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/gnvj0223_14.png filter=lfs diff=lfs merge=lfs -text
|
| 7607 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/gqa_images_2331056.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7608 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/hateful_memes_00000121.png filter=lfs diff=lfs merge=lfs -text
|
| 7609 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/hateful_memes_00000946.png filter=lfs diff=lfs merge=lfs -text
|
| 7610 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/hateful_memes_00001415.png filter=lfs diff=lfs merge=lfs -text
|
| 7611 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/hateful_memes_00004103.png filter=lfs diff=lfs merge=lfs -text
|
| 7612 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/hateful_memes_00004395.png filter=lfs diff=lfs merge=lfs -text
|
| 7613 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/hateful_memes_00005301.png filter=lfs diff=lfs merge=lfs -text
|
| 7614 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/hateful_memes_00006102.png filter=lfs diff=lfs merge=lfs -text
|
| 7615 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/hlvj0223_5.png filter=lfs diff=lfs merge=lfs -text
|
| 7616 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/hqcb0079_1.png filter=lfs diff=lfs merge=lfs -text
|
| 7617 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/human_system_digestive_3671.png filter=lfs diff=lfs merge=lfs -text
|
| 7618 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/human_system_muscular_6162.png filter=lfs diff=lfs merge=lfs -text
|
| 7619 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/identity_100949.png filter=lfs diff=lfs merge=lfs -text
|
| 7620 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/identity_1102.png filter=lfs diff=lfs merge=lfs -text
|
| 7621 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/identity_121996.png filter=lfs diff=lfs merge=lfs -text
|
| 7622 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/identity_178393.png filter=lfs diff=lfs merge=lfs -text
|
| 7623 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/identity_228572.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7624 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/identity_233489.png filter=lfs diff=lfs merge=lfs -text
|
| 7625 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/identity_295545.png filter=lfs diff=lfs merge=lfs -text
|
| 7626 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/image_textualization_000028931.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7627 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/image_textualization_000048745.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7628 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/image_textualization_000057758.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7629 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/image_textualization_000068046.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7630 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/infographic_vqa_00001045.png filter=lfs diff=lfs merge=lfs -text
|
| 7631 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/infographic_vqa_00001842.png filter=lfs diff=lfs merge=lfs -text
|
| 7632 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/information-sheets__schools__screening-pirated-dvds.png filter=lfs diff=lfs merge=lfs -text
|
| 7633 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/kzng0227_13.png filter=lfs diff=lfs merge=lfs -text
|
| 7634 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/kzng0227_30.png filter=lfs diff=lfs merge=lfs -text
|
| 7635 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/lhjh0227_3.png filter=lfs diff=lfs merge=lfs -text
|
| 7636 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/mapqa_00014030.png filter=lfs diff=lfs merge=lfs -text
|
| 7637 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/mapqa_00015080.png filter=lfs diff=lfs merge=lfs -text
|
| 7638 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/mapqa_00018919.png filter=lfs diff=lfs merge=lfs -text
|
| 7639 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/mapqa_00019597.png filter=lfs diff=lfs merge=lfs -text
|
| 7640 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/mapqa_00020998.png filter=lfs diff=lfs merge=lfs -text
|
| 7641 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/mapqa_00021653.png filter=lfs diff=lfs merge=lfs -text
|
| 7642 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/mapqa_00022334.png filter=lfs diff=lfs merge=lfs -text
|
| 7643 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/mapqa_00022961.png filter=lfs diff=lfs merge=lfs -text
|
| 7644 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/nldg0227_16.png filter=lfs diff=lfs merge=lfs -text
|
| 7645 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/pkbd0227_2.png filter=lfs diff=lfs merge=lfs -text
|
| 7646 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/pzmg0227_1.png filter=lfs diff=lfs merge=lfs -text
|
| 7647 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/rendered_text_00000322.png filter=lfs diff=lfs merge=lfs -text
|
| 7648 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/rendered_text_00002477.png filter=lfs diff=lfs merge=lfs -text
|
| 7649 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/rendered_text_00006606.png filter=lfs diff=lfs merge=lfs -text
|
| 7650 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/rendered_text_00006827.png filter=lfs diff=lfs merge=lfs -text
|
| 7651 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/rendered_text_00008123.png filter=lfs diff=lfs merge=lfs -text
|
| 7652 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_sqa_00000688.png filter=lfs diff=lfs merge=lfs -text
|
| 7653 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_sqa_00001761.png filter=lfs diff=lfs merge=lfs -text
|
| 7654 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_sqa_00004235.png filter=lfs diff=lfs merge=lfs -text
|
| 7655 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_sqa_00005254.png filter=lfs diff=lfs merge=lfs -text
|
| 7656 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_sqa_00006488.png filter=lfs diff=lfs merge=lfs -text
|
| 7657 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wikisql_00017148.png filter=lfs diff=lfs merge=lfs -text
|
| 7658 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wikisql_00022345.png filter=lfs diff=lfs merge=lfs -text
|
| 7659 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wikisql_00025558.png filter=lfs diff=lfs merge=lfs -text
|
| 7660 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wikisql_00028965.png filter=lfs diff=lfs merge=lfs -text
|
| 7661 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wikisql_00039057.png filter=lfs diff=lfs merge=lfs -text
|
| 7662 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wikisql_00045126.png filter=lfs diff=lfs merge=lfs -text
|
| 7663 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wikisql_00046516.png filter=lfs diff=lfs merge=lfs -text
|
| 7664 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wikisql_00051301.png filter=lfs diff=lfs merge=lfs -text
|
| 7665 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wikisql_00052683.png filter=lfs diff=lfs merge=lfs -text
|
| 7666 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wikisql_00057151.png filter=lfs diff=lfs merge=lfs -text
|
| 7667 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wikisql_00064796.png filter=lfs diff=lfs merge=lfs -text
|
| 7668 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wikisql_00067312.png filter=lfs diff=lfs merge=lfs -text
|
| 7669 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wikisql_00074000.png filter=lfs diff=lfs merge=lfs -text
|
| 7670 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wtq_00005518.png filter=lfs diff=lfs merge=lfs -text
|
| 7671 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wtq_00014606.png filter=lfs diff=lfs merge=lfs -text
|
| 7672 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wtq_00016168.png filter=lfs diff=lfs merge=lfs -text
|
| 7673 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wtq_00018484.png filter=lfs diff=lfs merge=lfs -text
|
| 7674 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wtq_00018651.png filter=lfs diff=lfs merge=lfs -text
|
| 7675 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wtq_00025796.png filter=lfs diff=lfs merge=lfs -text
|
| 7676 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wtq_00026608.png filter=lfs diff=lfs merge=lfs -text
|
| 7677 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wtq_00027454.png filter=lfs diff=lfs merge=lfs -text
|
| 7678 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wtq_00035291.png filter=lfs diff=lfs merge=lfs -text
|
| 7679 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wtq_00036709.png filter=lfs diff=lfs merge=lfs -text
|
| 7680 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/robut_wtq_00037407.png filter=lfs diff=lfs merge=lfs -text
|
| 7681 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/rtxm0227_11.png filter=lfs diff=lfs merge=lfs -text
|
| 7682 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/sa_14051.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7683 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/sa_14818.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7684 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/sa_16513.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7685 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/sa_16542.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7686 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/screen2words_00002430.png filter=lfs diff=lfs merge=lfs -text
|
| 7687 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/screen2words_00003580.png filter=lfs diff=lfs merge=lfs -text
|
| 7688 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/screen2words_00003808.png filter=lfs diff=lfs merge=lfs -text
|
| 7689 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/screen2words_00005136.png filter=lfs diff=lfs merge=lfs -text
|
| 7690 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/screen2words_00008652.png filter=lfs diff=lfs merge=lfs -text
|
| 7691 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/screen2words_00011020.png filter=lfs diff=lfs merge=lfs -text
|
| 7692 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/screen2words_00015281.png filter=lfs diff=lfs merge=lfs -text
|
| 7693 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/sharegpt4o_13358.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7694 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/sharegpt4o_18904.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7695 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/sharegpt4o_26696.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7696 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/sharegpt4o_30415.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7697 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/sharegpt4o_36245.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7698 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/sharegpt4o_54791.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7699 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/st_vqa_00006958.png filter=lfs diff=lfs merge=lfs -text
|
| 7700 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/st_vqa_00007820.png filter=lfs diff=lfs merge=lfs -text
|
| 7701 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/st_vqa_00011034.png filter=lfs diff=lfs merge=lfs -text
|
| 7702 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/st_vqa_00012481.png filter=lfs diff=lfs merge=lfs -text
|
| 7703 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/st_vqa_00012552.png filter=lfs diff=lfs merge=lfs -text
|
| 7704 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/st_vqa_00012968.png filter=lfs diff=lfs merge=lfs -text
|
| 7705 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/st_vqa_00014977.png filter=lfs diff=lfs merge=lfs -text
|
| 7706 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/st_vqa_00015303.png filter=lfs diff=lfs merge=lfs -text
|
| 7707 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/stbm0227_16.png filter=lfs diff=lfs merge=lfs -text
|
| 7708 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/tallyqa_00000636.png filter=lfs diff=lfs merge=lfs -text
|
| 7709 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/tallyqa_00010487.png filter=lfs diff=lfs merge=lfs -text
|
| 7710 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/tallyqa_VG_100K_2322803.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7711 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/tallyqa_VG_100K_2_2386823.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7712 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/vision-flan_CLEVR_CoGenT+Multiple_Question_Answering+CLEVR_trainA_062588.png filter=lfs diff=lfs merge=lfs -text
|
| 7713 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/vision-flan_CLEVR_CoGenT+Question_Answer_Matching+CLEVR_trainA_003809.png filter=lfs diff=lfs merge=lfs -text
|
| 7714 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/vision-flan_CLEVR_CoGenT+VQA_context+CLEVR_trainA_029052.png filter=lfs diff=lfs merge=lfs -text
|
| 7715 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/vision-flan_Clevr+VQA_context+CLEVR_train_021395.png filter=lfs diff=lfs merge=lfs -text
|
| 7716 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/vision-flan_Clevr+VQA_context+CLEVR_train_050800.png filter=lfs diff=lfs merge=lfs -text
|
| 7717 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/vision-flan_Dark-Zurich+time_classification+GOPR0345_frame_000691_rgb_anon.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7718 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/vision-flan_FGVC_Aircraft+Aircraft_Classification_Family+2235142.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7719 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/vision-flan_MEMOTION+sentiment_detection+image_329.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7720 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/vision-flan_VQG+question_generation+710487a5-43a2-4f69-9c5b-3b21ac207395_0.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7721 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/vision-flan_WIKIART+art_classification+ivan-aivazovsky_dusk-on-the-golden-horn-1845.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7722 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/vision-flan_infographicvqa+single_document_question+30989.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7723 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/vision-flan_semart+image_school+1683.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7724 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/vision-flan_textcaps+caption_generation+20971.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7725 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/visual7w_00004696.png filter=lfs diff=lfs merge=lfs -text
|
| 7726 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/visual7w_00008404.png filter=lfs diff=lfs merge=lfs -text
|
| 7727 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/visual7w_00010373.png filter=lfs diff=lfs merge=lfs -text
|
| 7728 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/visual7w_00013878.png filter=lfs diff=lfs merge=lfs -text
|
| 7729 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/vsr_00001436.png filter=lfs diff=lfs merge=lfs -text
|
| 7730 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/websight_00001692.png filter=lfs diff=lfs merge=lfs -text
|
| 7731 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/websight_00002626.png filter=lfs diff=lfs merge=lfs -text
|
| 7732 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/wiki__At_least_30_dead_after_bus_crashes_off_cliff_and_into_river_in_South_Africa.png filter=lfs diff=lfs merge=lfs -text
|
| 7733 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/wiki__Chile_elects_first_woman_President.png filter=lfs diff=lfs merge=lfs -text
|
| 7734 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/xggn0226_15.png filter=lfs diff=lfs merge=lfs -text
|
| 7735 |
+
UMM/Bagel-Med/train_data/bagel_example/vlm/images/xrcy0227_80.png filter=lfs diff=lfs merge=lfs -text
|
| 7736 |
+
UMM/Qwen3-VL/cookbooks/assets/agent_function_call/mobile_en_example.png filter=lfs diff=lfs merge=lfs -text
|
| 7737 |
+
UMM/Qwen3-VL/cookbooks/assets/agent_function_call/mobile_zh_example.png filter=lfs diff=lfs merge=lfs -text
|
| 7738 |
+
UMM/Qwen3-VL/cookbooks/assets/computer_use/computer_use1.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 7739 |
+
UMM/Qwen3-VL/cookbooks/assets/computer_use/computer_use2.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 7740 |
+
UMM/Qwen3-VL/cookbooks/assets/document_parsing/docparsing_example1.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7741 |
+
UMM/Qwen3-VL/cookbooks/assets/document_parsing/docparsing_example2.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7742 |
+
UMM/Qwen3-VL/cookbooks/assets/document_parsing/docparsing_example3.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7743 |
+
UMM/Qwen3-VL/cookbooks/assets/document_parsing/docparsing_example4.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7744 |
+
UMM/Qwen3-VL/cookbooks/assets/document_parsing/docparsing_example5.png filter=lfs diff=lfs merge=lfs -text
|
| 7745 |
+
UMM/Qwen3-VL/cookbooks/assets/document_parsing/docparsing_example6.png filter=lfs diff=lfs merge=lfs -text
|
| 7746 |
+
UMM/Qwen3-VL/cookbooks/assets/document_parsing/docparsing_example7.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7747 |
+
UMM/Qwen3-VL/cookbooks/assets/document_parsing/docparsing_example8.png filter=lfs diff=lfs merge=lfs -text
|
| 7748 |
+
UMM/Qwen3-VL/cookbooks/assets/multimodal_coding/screenshot_demo.png filter=lfs diff=lfs merge=lfs -text
|
| 7749 |
+
UMM/Qwen3-VL/cookbooks/assets/multimodal_coding/sketch2code_input.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 7750 |
+
UMM/Qwen3-VL/cookbooks/assets/ocr/.ipynb_checkpoints/ocr_example1-checkpoint.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7751 |
+
UMM/Qwen3-VL/cookbooks/assets/ocr/.ipynb_checkpoints/ocr_example2-checkpoint.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7752 |
+
UMM/Qwen3-VL/cookbooks/assets/ocr/.ipynb_checkpoints/ocr_example3-checkpoint.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7753 |
+
UMM/Qwen3-VL/cookbooks/assets/ocr/ocr_example1.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7754 |
+
UMM/Qwen3-VL/cookbooks/assets/ocr/ocr_example3.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7755 |
+
UMM/Qwen3-VL/cookbooks/assets/ocr/ocr_example5.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7756 |
+
UMM/Qwen3-VL/cookbooks/assets/omni_recognition/sample-anime-result.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7757 |
+
UMM/Qwen3-VL/cookbooks/assets/omni_recognition/sample-anime.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 7758 |
+
UMM/Qwen3-VL/cookbooks/assets/omni_recognition/sample-bird-result.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7759 |
+
UMM/Qwen3-VL/cookbooks/assets/omni_recognition/sample-bird.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7760 |
+
UMM/Qwen3-VL/cookbooks/assets/omni_recognition/sample-celebrity-result.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7761 |
+
UMM/Qwen3-VL/cookbooks/assets/omni_recognition/sample-celebrity.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 7762 |
+
UMM/Qwen3-VL/cookbooks/assets/omni_recognition/sample-food-result.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 7763 |
+
UMM/Qwen3-VL/cookbooks/assets/omni_recognition/sample-food.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 7764 |
+
UMM/Qwen3-VL/cookbooks/assets/qwenagent/hopinn.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7765 |
+
UMM/Qwen3-VL/cookbooks/assets/spatial_understanding/autonomous_driving.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7766 |
+
UMM/Qwen3-VL/cookbooks/assets/spatial_understanding/dining_table.png filter=lfs diff=lfs merge=lfs -text
|
| 7767 |
+
UMM/Qwen3-VL/cookbooks/assets/spatial_understanding/drone_cars2.png filter=lfs diff=lfs merge=lfs -text
|
| 7768 |
+
UMM/Qwen3-VL/cookbooks/assets/spatial_understanding/football_field.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7769 |
+
UMM/Qwen3-VL/cookbooks/assets/spatial_understanding/lots_of_cars.png filter=lfs diff=lfs merge=lfs -text
|
| 7770 |
+
UMM/Qwen3-VL/cookbooks/assets/spatial_understanding/lots_of_people.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 7771 |
+
UMM/Qwen3-VL/cookbooks/assets/spatial_understanding/office.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7772 |
+
UMM/Qwen3-VL/cookbooks/assets/spatial_understanding/spatio_case2_aff.png filter=lfs diff=lfs merge=lfs -text
|
| 7773 |
+
UMM/Qwen3-VL/cookbooks/assets/spatial_understanding/spatio_case2_aff2.png filter=lfs diff=lfs merge=lfs -text
|
| 7774 |
+
UMM/Qwen3-VL/cookbooks/assets/spatial_understanding/spatio_case2_plan.png filter=lfs diff=lfs merge=lfs -text
|
| 7775 |
+
UMM/Qwen3-VL/cookbooks/assets/spatial_understanding/spatio_case2_plan2.png filter=lfs diff=lfs merge=lfs -text
|
| 7776 |
+
UMM/Qwen3-VL/qwen-vl-finetune/demo/images/COCO_train2014_000000580957.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7777 |
+
UMM/Qwen3-VL/qwen-vl-finetune/demo/videos/v_7bUu05RIksU.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7778 |
+
UMM/Qwen3-VL/qwen-vl-finetune/demo/videos/v_TpB_zMG3XBA.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7779 |
+
UMM/Qwen3-VL/qwen-vl-finetune/demo/videos/v_rBMQFpHspmo.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7780 |
+
UMM/Show-o/docs/characteristic_comparison.png filter=lfs diff=lfs merge=lfs -text
|
| 7781 |
+
UMM/Show-o/docs/demo1.png filter=lfs diff=lfs merge=lfs -text
|
| 7782 |
+
UMM/Show-o/docs/demo2.png filter=lfs diff=lfs merge=lfs -text
|
| 7783 |
+
UMM/Show-o/docs/demo3.png filter=lfs diff=lfs merge=lfs -text
|
| 7784 |
+
UMM/Show-o/docs/github_extrapolation.png filter=lfs diff=lfs merge=lfs -text
|
| 7785 |
+
UMM/Show-o/docs/github_inpainting.png filter=lfs diff=lfs merge=lfs -text
|
| 7786 |
+
UMM/Show-o/docs/github_mmu.png filter=lfs diff=lfs merge=lfs -text
|
| 7787 |
+
UMM/Show-o/docs/github_t2i.png filter=lfs diff=lfs merge=lfs -text
|
| 7788 |
+
UMM/Show-o/docs/overview.png filter=lfs diff=lfs merge=lfs -text
|
| 7789 |
+
UMM/Show-o/docs/show-o-512x512-mmu.png filter=lfs diff=lfs merge=lfs -text
|
| 7790 |
+
UMM/Show-o/docs/show-o-512x512-t2i.png filter=lfs diff=lfs merge=lfs -text
|
| 7791 |
+
UMM/Show-o/docs/show-o-geneval.png filter=lfs diff=lfs merge=lfs -text
|
| 7792 |
+
UMM/Show-o/docs/show-o-want-u.png filter=lfs diff=lfs merge=lfs -text
|
| 7793 |
+
UMM/Show-o/docs/showo.png filter=lfs diff=lfs merge=lfs -text
|
| 7794 |
+
UMM/Show-o/docs/videos/i2v_1.gif filter=lfs diff=lfs merge=lfs -text
|
| 7795 |
+
UMM/Show-o/docs/videos/i2v_2.gif filter=lfs diff=lfs merge=lfs -text
|
| 7796 |
+
UMM/Show-o/docs/videos/i2v_3.gif filter=lfs diff=lfs merge=lfs -text
|
| 7797 |
+
UMM/Show-o/docs/videos/i2v_4.gif filter=lfs diff=lfs merge=lfs -text
|
| 7798 |
+
UMM/Show-o/docs/videos/sky.gif filter=lfs diff=lfs merge=lfs -text
|
| 7799 |
+
UMM/Show-o/docs/videos/waves.gif filter=lfs diff=lfs merge=lfs -text
|
| 7800 |
+
UMM/Show-o/docs/wechat_qa_3.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7801 |
+
UMM/Show-o/inpainting_validation/alpine_lake.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7802 |
+
UMM/Show-o/inpainting_validation/bench.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7803 |
+
UMM/Show-o/inpainting_validation/bus.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7804 |
+
UMM/Show-o/inpainting_validation/maya.png filter=lfs diff=lfs merge=lfs -text
|
| 7805 |
+
UMM/Show-o/inpainting_validation/river.png filter=lfs diff=lfs merge=lfs -text
|
| 7806 |
+
UMM/Show-o/inpainting_validation/train.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7807 |
+
UMM/Show-o/inpainting_validation/truebsee.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7808 |
+
UMM/Show-o/inpainting_validation/wukong1.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7809 |
+
UMM/Show-o/mmu_validation/sofa_under_water.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7810 |
+
UMM/Show-o/show-o2/docs/comparative_analysis.png filter=lfs diff=lfs merge=lfs -text
|
| 7811 |
+
UMM/Show-o/show-o2/docs/demo1.png filter=lfs diff=lfs merge=lfs -text
|
| 7812 |
+
UMM/Show-o/show-o2/docs/demo2.png filter=lfs diff=lfs merge=lfs -text
|
| 7813 |
+
UMM/Show-o/show-o2/docs/demo3.png filter=lfs diff=lfs merge=lfs -text
|
| 7814 |
+
UMM/Show-o/show-o2/docs/mmu/hanjingcenter.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7815 |
+
UMM/Show-o/show-o2/docs/mmu/pexels-fotios-photos-2923436.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7816 |
+
UMM/Show-o/show-o2/docs/mmu/pexels-jane-pham-727419-1571673.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7817 |
+
UMM/Show-o/show-o2/docs/mmu/pexels-mccutcheon-1148998.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7818 |
+
UMM/Show-o/show-o2/docs/mmu/pexels-muffin-1558665.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7819 |
+
UMM/Show-o/show-o2/docs/mmu/pexels-pixabay-207983.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7820 |
+
UMM/Show-o/show-o2/docs/mmu/pexels-psco-1071882.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7821 |
+
UMM/Show-o/show-o2/docs/mmu/pexels-talha-ahmed-26040377-7949588.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7822 |
+
UMM/Show-o/show-o2/docs/mmu/pexels-taryn-elliott-4144459.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7823 |
+
UMM/Show-o/show-o2/docs/mmu/pexels-thelazyartist-1117485.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7824 |
+
UMM/Show-o/show-o2/docs/overview.png filter=lfs diff=lfs merge=lfs -text
|
| 7825 |
+
UMM/Show-o/show-o2/docs/videos/i2v_1.gif filter=lfs diff=lfs merge=lfs -text
|
| 7826 |
+
UMM/Show-o/show-o2/docs/videos/i2v_2.gif filter=lfs diff=lfs merge=lfs -text
|
| 7827 |
+
UMM/Show-o/show-o2/docs/videos/i2v_3.gif filter=lfs diff=lfs merge=lfs -text
|
| 7828 |
+
UMM/Show-o/show-o2/docs/videos/i2v_4.gif filter=lfs diff=lfs merge=lfs -text
|
| 7829 |
+
UMM/Show-o/show-o2/docs/videos/sky.gif filter=lfs diff=lfs merge=lfs -text
|
| 7830 |
+
UMM/Show-o/show-o2/docs/videos/video1.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7831 |
+
UMM/Show-o/show-o2/docs/videos/video2.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7832 |
+
UMM/Show-o/show-o2/docs/videos/waves.gif filter=lfs diff=lfs merge=lfs -text
|
| 7833 |
+
UMM/Show-o/training/questions.json filter=lfs diff=lfs merge=lfs -text
|
| 7834 |
+
UMM/UniVideo/assets/image.png filter=lfs diff=lfs merge=lfs -text
|
| 7835 |
+
UMM/UniVideo/assets/teaser.gif filter=lfs diff=lfs merge=lfs -text
|
| 7836 |
+
UMM/UniVideo/assets/teaser.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7837 |
+
UMM/UniVideo/demo/i2v/1.png filter=lfs diff=lfs merge=lfs -text
|
| 7838 |
+
UMM/UniVideo/demo/i2v/output.gif filter=lfs diff=lfs merge=lfs -text
|
| 7839 |
+
UMM/UniVideo/demo/i2v/output.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7840 |
+
UMM/UniVideo/demo/in_context_image_edit/input.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7841 |
+
UMM/UniVideo/demo/in_context_video_edit/id_addition/output.gif filter=lfs diff=lfs merge=lfs -text
|
| 7842 |
+
UMM/UniVideo/demo/in_context_video_edit/id_addition/output.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7843 |
+
UMM/UniVideo/demo/in_context_video_edit/id_addition/reference.gif filter=lfs diff=lfs merge=lfs -text
|
| 7844 |
+
UMM/UniVideo/demo/in_context_video_edit/id_addition/reference.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7845 |
+
UMM/UniVideo/demo/in_context_video_edit/id_swap/ID.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 7846 |
+
UMM/UniVideo/demo/in_context_video_edit/id_swap/origin.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7847 |
+
UMM/UniVideo/demo/in_context_video_edit/id_swap/output.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7848 |
+
UMM/UniVideo/demo/in_context_video_edit/style/output.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7849 |
+
UMM/UniVideo/demo/in_context_video_edit/style/ref.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7850 |
+
UMM/UniVideo/demo/in_context_video_edit/style/video.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7851 |
+
UMM/UniVideo/demo/in_context_video_gen/1.png filter=lfs diff=lfs merge=lfs -text
|
| 7852 |
+
UMM/UniVideo/demo/in_context_video_gen/2.png filter=lfs diff=lfs merge=lfs -text
|
| 7853 |
+
UMM/UniVideo/demo/in_context_video_gen/3.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7854 |
+
UMM/UniVideo/demo/in_context_video_gen/output.gif filter=lfs diff=lfs merge=lfs -text
|
| 7855 |
+
UMM/UniVideo/demo/in_context_video_gen/output.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7856 |
+
UMM/UniVideo/demo/t2i/output.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7857 |
+
UMM/UniVideo/demo/t2v/output.gif filter=lfs diff=lfs merge=lfs -text
|
| 7858 |
+
UMM/UniVideo/demo/t2v/output.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7859 |
+
UMM/UniVideo/demo/understanding/1.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7860 |
+
UMM/UniVideo/demo/understanding/input.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7861 |
+
UMM/UniVideo/demo/video_edit/output.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7862 |
+
UMM/UniVideo/demo/video_edit/style/output.gif filter=lfs diff=lfs merge=lfs -text
|
| 7863 |
+
UMM/UniVideo/demo/video_edit/style/output.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7864 |
+
UMM/UniVideo/demo/video_edit/video.gif filter=lfs diff=lfs merge=lfs -text
|
| 7865 |
+
UMM/UniVideo/demo/video_edit/video.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7866 |
+
UMM/UniVideo-Med/assets/image.png filter=lfs diff=lfs merge=lfs -text
|
| 7867 |
+
UMM/UniVideo-Med/assets/teaser.gif filter=lfs diff=lfs merge=lfs -text
|
| 7868 |
+
UMM/UniVideo-Med/assets/teaser.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7869 |
+
UMM/UniVideo-Med/demo/i2v/1.png filter=lfs diff=lfs merge=lfs -text
|
| 7870 |
+
UMM/UniVideo-Med/demo/i2v/output.gif filter=lfs diff=lfs merge=lfs -text
|
| 7871 |
+
UMM/UniVideo-Med/demo/i2v/output.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7872 |
+
UMM/UniVideo-Med/demo/in_context_image_edit/input.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7873 |
+
UMM/UniVideo-Med/demo/in_context_video_edit/id_addition/output.gif filter=lfs diff=lfs merge=lfs -text
|
| 7874 |
+
UMM/UniVideo-Med/demo/in_context_video_edit/id_addition/output.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7875 |
+
UMM/UniVideo-Med/demo/in_context_video_edit/id_addition/reference.gif filter=lfs diff=lfs merge=lfs -text
|
| 7876 |
+
UMM/UniVideo-Med/demo/in_context_video_edit/id_addition/reference.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7877 |
+
UMM/UniVideo-Med/demo/in_context_video_edit/id_swap/ID.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 7878 |
+
UMM/UniVideo-Med/demo/in_context_video_edit/id_swap/origin.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7879 |
+
UMM/UniVideo-Med/demo/in_context_video_edit/id_swap/output.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7880 |
+
UMM/UniVideo-Med/demo/in_context_video_edit/style/output.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7881 |
+
UMM/UniVideo-Med/demo/in_context_video_edit/style/ref.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7882 |
+
UMM/UniVideo-Med/demo/in_context_video_edit/style/video.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7883 |
+
UMM/UniVideo-Med/demo/in_context_video_gen/1.png filter=lfs diff=lfs merge=lfs -text
|
| 7884 |
+
UMM/UniVideo-Med/demo/in_context_video_gen/2.png filter=lfs diff=lfs merge=lfs -text
|
| 7885 |
+
UMM/UniVideo-Med/demo/in_context_video_gen/3.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7886 |
+
UMM/UniVideo-Med/demo/in_context_video_gen/output.gif filter=lfs diff=lfs merge=lfs -text
|
| 7887 |
+
UMM/UniVideo-Med/demo/in_context_video_gen/output.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7888 |
+
UMM/UniVideo-Med/demo/t2i/output.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7889 |
+
UMM/UniVideo-Med/demo/t2v/output.gif filter=lfs diff=lfs merge=lfs -text
|
| 7890 |
+
UMM/UniVideo-Med/demo/t2v/output.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7891 |
+
UMM/UniVideo-Med/demo/understanding/1.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7892 |
+
UMM/UniVideo-Med/demo/understanding/input.jpg filter=lfs diff=lfs merge=lfs -text
|
| 7893 |
+
UMM/UniVideo-Med/demo/video_edit/output.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7894 |
+
UMM/UniVideo-Med/demo/video_edit/style/output.gif filter=lfs diff=lfs merge=lfs -text
|
| 7895 |
+
UMM/UniVideo-Med/demo/video_edit/style/output.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7896 |
+
UMM/UniVideo-Med/demo/video_edit/video.gif filter=lfs diff=lfs merge=lfs -text
|
| 7897 |
+
UMM/UniVideo-Med/demo/video_edit/video.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 7898 |
+
UMM/UniVideo-Med/sharegpt4video_40k.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 7899 |
+
UMM/unsloth/images/Where_Terminal.png filter=lfs diff=lfs merge=lfs -text
|
| 7900 |
+
UMM/unsloth/images/unsloth[[:space:]]end.png filter=lfs diff=lfs merge=lfs -text
|
| 7901 |
+
UMM/unsloth/images/unsloth[[:space:]]loading[[:space:]]page[[:space:]]render.png filter=lfs diff=lfs merge=lfs -text
|
| 7902 |
+
UMM/unsloth/images/unsloth[[:space:]]logo[[:space:]]black[[:space:]]text.png filter=lfs diff=lfs merge=lfs -text
|
| 7903 |
+
UMM/unsloth/images/unsloth[[:space:]]logo[[:space:]]white[[:space:]]text.png filter=lfs diff=lfs merge=lfs -text
|
| 7904 |
+
UMM/unsloth/images/unsloth[[:space:]]sticker.png filter=lfs diff=lfs merge=lfs -text
|
UMM/BLIP3o-Qwen3-Siglip2/README.md
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
## Qwen3 + SigLIP2 + EVACLIP
|
| 3 |
+
|
| 4 |
+
This branch combines:
|
| 5 |
+
- **Qwen3** as the autoregressive backbone (You can choose any size Qwen3 model, 0.6B, 1.7B, 4B, 8B, 14B, 32B)
|
| 6 |
+
- **SigLIP2** for image understanding vision encoder
|
| 7 |
+
- **EVACLIP** for image generation vision encoder
|
| 8 |
+
|
| 9 |
+
You can set up and run this in the same environment as the `main` branch.
|
| 10 |
+
|
| 11 |
+
### Available Training Modes
|
| 12 |
+
- **Image Understanding (I2T)**
|
| 13 |
+
- **Image Generation (T2I)**
|
| 14 |
+
- **Joint Training** (both tasks)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
To choose different training tasks, update the dataloader in `train.py`:
|
| 18 |
+
- Image generation data [https://github.com/JiuhaiChen/BLIP3o/blob/Qwen3-Siglip2/blip3o/train/train.py#L498]
|
| 19 |
+
- Image understanding data [https://github.com/JiuhaiChen/BLIP3o/blob/Qwen3-Siglip2/blip3o/train/train.py#L512]
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
Specific data type markers in the script:
|
| 23 |
+
- **T2I** (Text-to-Image) [https://github.com/JiuhaiChen/BLIP3o/blob/Qwen3-Siglip2/blip3o/train/train.py#L503]
|
| 24 |
+
- **I2T** (Image-to-Text) [https://github.com/JiuhaiChen/BLIP3o/blob/Qwen3-Siglip2/blip3o/train/train.py#L517]
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
### Freezing the Backbone
|
| 28 |
+
- Add `--freeze_backbone True` in the training script to freeze Qwen3 during training
|
| 29 |
+
- Add `--freeze_backbone False` in the training script to unfreeze Qwen3 during training (we recommend unfreeze backbone when you train image understanding tasks)
|
| 30 |
+
|
| 31 |
+
### Adjust your batch size according to your GPU setup!
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/__init__.py
ADDED
|
File without changes
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (156 Bytes). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/__pycache__/constants.cpython-311.pyc
ADDED
|
Binary file (669 Bytes). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/__pycache__/conversation.cpython-311.pyc
ADDED
|
Binary file (19.6 kB). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/__pycache__/mm_utils.cpython-311.pyc
ADDED
|
Binary file (14.6 kB). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/__pycache__/utils.cpython-311.pyc
ADDED
|
Binary file (6.94 kB). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/constants.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
CONTROLLER_HEART_BEAT_EXPIRATION = 30
|
| 2 |
+
WORKER_HEART_BEAT_INTERVAL = 15
|
| 3 |
+
|
| 4 |
+
LOGDIR = "."
|
| 5 |
+
|
| 6 |
+
# Model Constants
|
| 7 |
+
IGNORE_INDEX = -100
|
| 8 |
+
# IMAGE_TOKEN_INDEX = -200
|
| 9 |
+
|
| 10 |
+
DEFAULT_IMAGE_TOKEN = "<image>"
|
| 11 |
+
DEFAULT_IM_START_TOKEN = "[IMG]"
|
| 12 |
+
DEFAULT_IM_END_TOKEN = "[/IMG]"
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# IMAGE_TOKEN_IDX = 32002
|
| 16 |
+
# DEFAULT_IM_START_TOKEN_IDX = 32000
|
| 17 |
+
# DEFAULT_IM_END_TOKEN_IDX = 32001
|
| 18 |
+
|
| 19 |
+
IMAGE_TOKEN_IDX = 151655
|
| 20 |
+
DEFAULT_IM_START_TOKEN_IDX = 151669
|
| 21 |
+
DEFAULT_IM_END_TOKEN_IDX = 151670
|
| 22 |
+
UND_IMAGE_TOKEN_IDX = 151671
|
| 23 |
+
# N_QUERY = 729
|
| 24 |
+
|
| 25 |
+
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
|
| 26 |
+
IMAGE_PLACEHOLDER = "<image-placeholder>"
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/conversation.py
ADDED
|
@@ -0,0 +1,476 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import dataclasses
|
| 2 |
+
from enum import auto, Enum
|
| 3 |
+
from typing import List, Tuple
|
| 4 |
+
import base64
|
| 5 |
+
from io import BytesIO
|
| 6 |
+
from PIL import Image
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class SeparatorStyle(Enum):
|
| 10 |
+
"""Different separator style."""
|
| 11 |
+
SINGLE = auto()
|
| 12 |
+
TWO = auto()
|
| 13 |
+
MPT = auto()
|
| 14 |
+
PLAIN = auto()
|
| 15 |
+
LLAMA_2 = auto()
|
| 16 |
+
CHATML = auto()
|
| 17 |
+
QWEN = auto()
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@dataclasses.dataclass
|
| 21 |
+
class Conversation:
|
| 22 |
+
"""A class that keeps all conversation history."""
|
| 23 |
+
system: str
|
| 24 |
+
roles: List[str]
|
| 25 |
+
messages: List[List[str]]
|
| 26 |
+
offset: int
|
| 27 |
+
sep_style: SeparatorStyle = SeparatorStyle.SINGLE
|
| 28 |
+
sep: str = "###"
|
| 29 |
+
sep2: str = None
|
| 30 |
+
version: str = "Unknown"
|
| 31 |
+
|
| 32 |
+
skip_next: bool = False
|
| 33 |
+
|
| 34 |
+
def get_prompt(self):
|
| 35 |
+
messages = self.messages
|
| 36 |
+
if len(messages) > 0 and type(messages[0][1]) is tuple:
|
| 37 |
+
messages = self.messages.copy()
|
| 38 |
+
init_role, init_msg = messages[0].copy()
|
| 39 |
+
init_msg = init_msg[0]
|
| 40 |
+
if "mmtag" in self.version:
|
| 41 |
+
init_msg = init_msg.replace("<image>", "").strip()
|
| 42 |
+
messages[0] = (init_role, init_msg)
|
| 43 |
+
messages.insert(0, (self.roles[0], "<Image><image></Image>"))
|
| 44 |
+
messages.insert(1, (self.roles[1], "Received."))
|
| 45 |
+
elif not init_msg.startswith("<image>"):
|
| 46 |
+
init_msg = init_msg.replace("<image>", "").strip()
|
| 47 |
+
messages[0] = (init_role, "<image>\n" + init_msg)
|
| 48 |
+
else:
|
| 49 |
+
messages[0] = (init_role, init_msg)
|
| 50 |
+
|
| 51 |
+
if self.sep_style == SeparatorStyle.SINGLE:
|
| 52 |
+
ret = self.system + self.sep
|
| 53 |
+
for role, message in messages:
|
| 54 |
+
if message:
|
| 55 |
+
if type(message) is tuple:
|
| 56 |
+
message, _, _ = message
|
| 57 |
+
ret += role + ": " + message + self.sep
|
| 58 |
+
else:
|
| 59 |
+
ret += role + ":"
|
| 60 |
+
|
| 61 |
+
elif self.sep_style == SeparatorStyle.TWO:
|
| 62 |
+
seps = [self.sep, self.sep2]
|
| 63 |
+
ret = self.system + seps[0]
|
| 64 |
+
for i, (role, message) in enumerate(messages):
|
| 65 |
+
if message:
|
| 66 |
+
if type(message) is tuple:
|
| 67 |
+
message, _, _ = message
|
| 68 |
+
ret += role + ": " + message + seps[i % 2]
|
| 69 |
+
else:
|
| 70 |
+
ret += role + ":"
|
| 71 |
+
|
| 72 |
+
elif self.sep_style == SeparatorStyle.CHATML:
|
| 73 |
+
ret = "" if self.system == "" else self.system + self.sep + "\n"
|
| 74 |
+
for role, message in messages:
|
| 75 |
+
if message:
|
| 76 |
+
if type(message) is tuple:
|
| 77 |
+
message, images, _ = message
|
| 78 |
+
message = "<image>" * len(images) + message
|
| 79 |
+
ret += role + "\n" + message + self.sep + "\n"
|
| 80 |
+
else:
|
| 81 |
+
ret += role + "\n"
|
| 82 |
+
return ret
|
| 83 |
+
|
| 84 |
+
elif self.sep_style == SeparatorStyle.LLAMA_3:
|
| 85 |
+
if self.tokenizer is None:
|
| 86 |
+
raise ValueError("Llama 3 tokenizer is not available. Make sure you have the necessary permissions.")
|
| 87 |
+
chat_template_messages = [{"role": "system", "content": self.system}]
|
| 88 |
+
for role, message in messages:
|
| 89 |
+
if message:
|
| 90 |
+
if type(message) is tuple:
|
| 91 |
+
message, images = message
|
| 92 |
+
message = "<image>" * len(images) + message
|
| 93 |
+
chat_template_messages.append({"role": role, "content": message})
|
| 94 |
+
|
| 95 |
+
# print(chat_template_messages)
|
| 96 |
+
return self.tokenizer.apply_chat_template(chat_template_messages, tokenize=False, add_generation_prompt=True)
|
| 97 |
+
# ret = "" if self.system == "" else self.system + self.sep + "\n"
|
| 98 |
+
# for role, message in messages:
|
| 99 |
+
# if message:
|
| 100 |
+
# if type(message) is tuple:
|
| 101 |
+
# message, images = message
|
| 102 |
+
# message = "<image>" * len(images) + message
|
| 103 |
+
# ret += role + "\n" + message + self.sep + "\n"
|
| 104 |
+
# else:
|
| 105 |
+
# ret += role + "\n"
|
| 106 |
+
# return ret
|
| 107 |
+
|
| 108 |
+
elif self.sep_style == SeparatorStyle.MPT:
|
| 109 |
+
ret = self.system + self.sep
|
| 110 |
+
for role, message in messages:
|
| 111 |
+
if message:
|
| 112 |
+
if type(message) is tuple:
|
| 113 |
+
message, _, _ = message
|
| 114 |
+
ret += role + message + self.sep
|
| 115 |
+
else:
|
| 116 |
+
ret += role
|
| 117 |
+
|
| 118 |
+
elif self.sep_style == SeparatorStyle.GEMMA:
|
| 119 |
+
ret = ""
|
| 120 |
+
for i, (role, message) in enumerate(messages):
|
| 121 |
+
assert role == self.roles[i % 2], "Conversation should alternate user/assistant/user/assistant/..."
|
| 122 |
+
if message:
|
| 123 |
+
if type(message) is tuple:
|
| 124 |
+
message, _, _ = message
|
| 125 |
+
ret += role + message + self.sep
|
| 126 |
+
else:
|
| 127 |
+
ret += role
|
| 128 |
+
|
| 129 |
+
elif self.sep_style == SeparatorStyle.LLAMA_2:
|
| 130 |
+
wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n" if len(msg) > 0 else msg
|
| 131 |
+
wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
|
| 132 |
+
ret = ""
|
| 133 |
+
|
| 134 |
+
for i, (role, message) in enumerate(messages):
|
| 135 |
+
if i == 0:
|
| 136 |
+
assert message, "first message should not be none"
|
| 137 |
+
assert role == self.roles[0], "first message should come from user"
|
| 138 |
+
if message:
|
| 139 |
+
if type(message) is tuple:
|
| 140 |
+
message, _, _ = message
|
| 141 |
+
if i == 0:
|
| 142 |
+
message = wrap_sys(self.system) + message
|
| 143 |
+
if i % 2 == 0:
|
| 144 |
+
message = wrap_inst(message)
|
| 145 |
+
ret += self.sep + message
|
| 146 |
+
else:
|
| 147 |
+
ret += " " + message + " " + self.sep2
|
| 148 |
+
else:
|
| 149 |
+
ret += ""
|
| 150 |
+
ret = ret.lstrip(self.sep)
|
| 151 |
+
|
| 152 |
+
elif self.sep_style == SeparatorStyle.PLAIN:
|
| 153 |
+
seps = [self.sep, self.sep2]
|
| 154 |
+
ret = self.system
|
| 155 |
+
for i, (role, message) in enumerate(messages):
|
| 156 |
+
if message:
|
| 157 |
+
if type(message) is tuple:
|
| 158 |
+
message, _, _ = message
|
| 159 |
+
ret += message + seps[i % 2]
|
| 160 |
+
else:
|
| 161 |
+
ret += ""
|
| 162 |
+
else:
|
| 163 |
+
raise ValueError(f"Invalid style: {self.sep_style}")
|
| 164 |
+
|
| 165 |
+
return ret
|
| 166 |
+
|
| 167 |
+
def append_message(self, role, message):
|
| 168 |
+
self.messages.append([role, message])
|
| 169 |
+
|
| 170 |
+
def process_image(self, image, image_process_mode, return_pil=False, image_format='PNG', max_len=1344, min_len=672):
|
| 171 |
+
if image_process_mode == "Pad":
|
| 172 |
+
def expand2square(pil_img, background_color=(122, 116, 104)):
|
| 173 |
+
width, height = pil_img.size
|
| 174 |
+
if width == height:
|
| 175 |
+
return pil_img
|
| 176 |
+
elif width > height:
|
| 177 |
+
result = Image.new(pil_img.mode, (width, width), background_color)
|
| 178 |
+
result.paste(pil_img, (0, (width - height) // 2))
|
| 179 |
+
return result
|
| 180 |
+
else:
|
| 181 |
+
result = Image.new(pil_img.mode, (height, height), background_color)
|
| 182 |
+
result.paste(pil_img, ((height - width) // 2, 0))
|
| 183 |
+
return result
|
| 184 |
+
|
| 185 |
+
image = expand2square(image)
|
| 186 |
+
elif image_process_mode in ["Default", "Crop"]:
|
| 187 |
+
pass
|
| 188 |
+
elif image_process_mode == "Resize":
|
| 189 |
+
image = image.resize((336, 336))
|
| 190 |
+
else:
|
| 191 |
+
raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
|
| 192 |
+
if max(image.size) > max_len:
|
| 193 |
+
max_hw, min_hw = max(image.size), min(image.size)
|
| 194 |
+
aspect_ratio = max_hw / min_hw
|
| 195 |
+
shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
|
| 196 |
+
longest_edge = int(shortest_edge * aspect_ratio)
|
| 197 |
+
W, H = image.size
|
| 198 |
+
if H > W:
|
| 199 |
+
H, W = longest_edge, shortest_edge
|
| 200 |
+
else:
|
| 201 |
+
H, W = shortest_edge, longest_edge
|
| 202 |
+
image = image.resize((W, H))
|
| 203 |
+
if return_pil:
|
| 204 |
+
return image
|
| 205 |
+
else:
|
| 206 |
+
buffered = BytesIO()
|
| 207 |
+
image.save(buffered, format=image_format)
|
| 208 |
+
img_b64_str = base64.b64encode(buffered.getvalue()).decode()
|
| 209 |
+
return img_b64_str
|
| 210 |
+
|
| 211 |
+
def get_images(self, return_pil=False):
|
| 212 |
+
images = []
|
| 213 |
+
for i, (role, msg) in enumerate(self.messages[self.offset:]):
|
| 214 |
+
if i % 2 == 0:
|
| 215 |
+
if type(msg) is tuple:
|
| 216 |
+
msg, image, image_process_mode = msg
|
| 217 |
+
image = self.process_image(image, image_process_mode, return_pil=return_pil)
|
| 218 |
+
images.append(image)
|
| 219 |
+
return images
|
| 220 |
+
|
| 221 |
+
def to_gradio_chatbot(self):
|
| 222 |
+
ret = []
|
| 223 |
+
for i, (role, msg) in enumerate(self.messages[self.offset:]):
|
| 224 |
+
if i % 2 == 0:
|
| 225 |
+
if type(msg) is tuple:
|
| 226 |
+
msg, image, image_process_mode = msg
|
| 227 |
+
img_b64_str = self.process_image(
|
| 228 |
+
image, "Default", return_pil=False,
|
| 229 |
+
image_format='JPEG')
|
| 230 |
+
img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
|
| 231 |
+
msg = img_str + msg.replace('<image>', '').strip()
|
| 232 |
+
ret.append([msg, None])
|
| 233 |
+
else:
|
| 234 |
+
ret.append([msg, None])
|
| 235 |
+
else:
|
| 236 |
+
ret[-1][-1] = msg
|
| 237 |
+
return ret
|
| 238 |
+
|
| 239 |
+
def copy(self):
|
| 240 |
+
return Conversation(
|
| 241 |
+
system=self.system,
|
| 242 |
+
roles=self.roles,
|
| 243 |
+
messages=[[x, y] for x, y in self.messages],
|
| 244 |
+
offset=self.offset,
|
| 245 |
+
sep_style=self.sep_style,
|
| 246 |
+
sep=self.sep,
|
| 247 |
+
sep2=self.sep2,
|
| 248 |
+
version=self.version)
|
| 249 |
+
|
| 250 |
+
def dict(self):
|
| 251 |
+
if len(self.get_images()) > 0:
|
| 252 |
+
return {
|
| 253 |
+
"system": self.system,
|
| 254 |
+
"roles": self.roles,
|
| 255 |
+
"messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
|
| 256 |
+
"offset": self.offset,
|
| 257 |
+
"sep": self.sep,
|
| 258 |
+
"sep2": self.sep2,
|
| 259 |
+
}
|
| 260 |
+
return {
|
| 261 |
+
"system": self.system,
|
| 262 |
+
"roles": self.roles,
|
| 263 |
+
"messages": self.messages,
|
| 264 |
+
"offset": self.offset,
|
| 265 |
+
"sep": self.sep,
|
| 266 |
+
"sep2": self.sep2,
|
| 267 |
+
}
|
| 268 |
+
|
| 269 |
+
|
| 270 |
+
conv_vicuna_v0 = Conversation(
|
| 271 |
+
system="A chat between a curious human and an artificial intelligence assistant. "
|
| 272 |
+
"The assistant gives helpful, detailed, and polite answers to the human's questions.",
|
| 273 |
+
roles=("Human", "Assistant"),
|
| 274 |
+
messages=(
|
| 275 |
+
("Human", "What are the key differences between renewable and non-renewable energy sources?"),
|
| 276 |
+
("Assistant",
|
| 277 |
+
"Renewable energy sources are those that can be replenished naturally in a relatively "
|
| 278 |
+
"short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
|
| 279 |
+
"Non-renewable energy sources, on the other hand, are finite and will eventually be "
|
| 280 |
+
"depleted, such as coal, oil, and natural gas. Here are some key differences between "
|
| 281 |
+
"renewable and non-renewable energy sources:\n"
|
| 282 |
+
"1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
|
| 283 |
+
"energy sources are finite and will eventually run out.\n"
|
| 284 |
+
"2. Environmental impact: Renewable energy sources have a much lower environmental impact "
|
| 285 |
+
"than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
|
| 286 |
+
"and other negative effects.\n"
|
| 287 |
+
"3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
|
| 288 |
+
"have lower operational costs than non-renewable sources.\n"
|
| 289 |
+
"4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
|
| 290 |
+
"locations than non-renewable sources.\n"
|
| 291 |
+
"5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
|
| 292 |
+
"situations and needs, while non-renewable sources are more rigid and inflexible.\n"
|
| 293 |
+
"6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
|
| 294 |
+
"non-renewable sources are not, and their depletion can lead to economic and social instability.\n")
|
| 295 |
+
),
|
| 296 |
+
offset=2,
|
| 297 |
+
sep_style=SeparatorStyle.SINGLE,
|
| 298 |
+
sep="###",
|
| 299 |
+
)
|
| 300 |
+
|
| 301 |
+
conv_vicuna_v1 = Conversation(
|
| 302 |
+
system="A chat between a curious user and an artificial intelligence assistant. "
|
| 303 |
+
"The assistant gives helpful, detailed, and polite answers to the user's questions.",
|
| 304 |
+
roles=("USER", "ASSISTANT"),
|
| 305 |
+
version="v1",
|
| 306 |
+
messages=(),
|
| 307 |
+
offset=0,
|
| 308 |
+
sep_style=SeparatorStyle.TWO,
|
| 309 |
+
sep=" ",
|
| 310 |
+
sep2="</s>",
|
| 311 |
+
)
|
| 312 |
+
|
| 313 |
+
conv_llama_2 = Conversation(
|
| 314 |
+
system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
|
| 315 |
+
|
| 316 |
+
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
|
| 317 |
+
roles=("USER", "ASSISTANT"),
|
| 318 |
+
version="llama_v2",
|
| 319 |
+
messages=(),
|
| 320 |
+
offset=0,
|
| 321 |
+
sep_style=SeparatorStyle.LLAMA_2,
|
| 322 |
+
sep="<s>",
|
| 323 |
+
sep2="</s>",
|
| 324 |
+
)
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
conv_blip3o_llama_2 = Conversation(
|
| 328 |
+
system="You are a helpful language and vision assistant. "
|
| 329 |
+
"You are able to understand the visual content that the user provides, "
|
| 330 |
+
"and assist the user with a variety of tasks using natural language.",
|
| 331 |
+
roles=("USER", "ASSISTANT"),
|
| 332 |
+
version="llama_v2",
|
| 333 |
+
messages=(),
|
| 334 |
+
offset=0,
|
| 335 |
+
sep_style=SeparatorStyle.LLAMA_2,
|
| 336 |
+
sep="<s>",
|
| 337 |
+
sep2="</s>",
|
| 338 |
+
)
|
| 339 |
+
|
| 340 |
+
conv_mpt = Conversation(
|
| 341 |
+
system="""<|im_start|>system
|
| 342 |
+
A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
|
| 343 |
+
roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
|
| 344 |
+
version="mpt",
|
| 345 |
+
messages=(),
|
| 346 |
+
offset=0,
|
| 347 |
+
sep_style=SeparatorStyle.MPT,
|
| 348 |
+
sep="<|im_end|>",
|
| 349 |
+
)
|
| 350 |
+
|
| 351 |
+
conv_blip3o_plain = Conversation(
|
| 352 |
+
system="",
|
| 353 |
+
roles=("", ""),
|
| 354 |
+
messages=(
|
| 355 |
+
),
|
| 356 |
+
offset=0,
|
| 357 |
+
sep_style=SeparatorStyle.PLAIN,
|
| 358 |
+
sep="\n",
|
| 359 |
+
)
|
| 360 |
+
|
| 361 |
+
conv_blip3o_v0 = Conversation(
|
| 362 |
+
system="A chat between a curious human and an artificial intelligence assistant. "
|
| 363 |
+
"The assistant gives helpful, detailed, and polite answers to the human's questions.",
|
| 364 |
+
roles=("Human", "Assistant"),
|
| 365 |
+
messages=(
|
| 366 |
+
),
|
| 367 |
+
offset=0,
|
| 368 |
+
sep_style=SeparatorStyle.SINGLE,
|
| 369 |
+
sep="###",
|
| 370 |
+
)
|
| 371 |
+
|
| 372 |
+
conv_blip3o_v0_mmtag = Conversation(
|
| 373 |
+
system="A chat between a curious user and an artificial intelligence assistant. "
|
| 374 |
+
"The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
|
| 375 |
+
"The visual content will be provided with the following format: <Image>visual content</Image>.",
|
| 376 |
+
roles=("Human", "Assistant"),
|
| 377 |
+
messages=(
|
| 378 |
+
),
|
| 379 |
+
offset=0,
|
| 380 |
+
sep_style=SeparatorStyle.SINGLE,
|
| 381 |
+
sep="###",
|
| 382 |
+
version="v0_mmtag",
|
| 383 |
+
)
|
| 384 |
+
|
| 385 |
+
conv_blip3o_v1 = Conversation(
|
| 386 |
+
system="A chat between a curious human and an artificial intelligence assistant. "
|
| 387 |
+
"The assistant gives helpful, detailed, and polite answers to the human's questions.",
|
| 388 |
+
roles=("USER", "ASSISTANT"),
|
| 389 |
+
version="v1",
|
| 390 |
+
messages=(),
|
| 391 |
+
offset=0,
|
| 392 |
+
sep_style=SeparatorStyle.TWO,
|
| 393 |
+
sep=" ",
|
| 394 |
+
sep2="</s>",
|
| 395 |
+
)
|
| 396 |
+
|
| 397 |
+
conv_blip3o_v1_mmtag = Conversation(
|
| 398 |
+
system="A chat between a curious user and an artificial intelligence assistant. "
|
| 399 |
+
"The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
|
| 400 |
+
"The visual content will be provided with the following format: <Image>visual content</Image>.",
|
| 401 |
+
roles=("USER", "ASSISTANT"),
|
| 402 |
+
messages=(),
|
| 403 |
+
offset=0,
|
| 404 |
+
sep_style=SeparatorStyle.TWO,
|
| 405 |
+
sep=" ",
|
| 406 |
+
sep2="</s>",
|
| 407 |
+
version="v1_mmtag",
|
| 408 |
+
)
|
| 409 |
+
|
| 410 |
+
conv_mistral_instruct = Conversation(
|
| 411 |
+
system="",
|
| 412 |
+
roles=("USER", "ASSISTANT"),
|
| 413 |
+
version="llama_v2",
|
| 414 |
+
messages=(),
|
| 415 |
+
offset=0,
|
| 416 |
+
sep_style=SeparatorStyle.LLAMA_2,
|
| 417 |
+
sep="",
|
| 418 |
+
sep2="</s>",
|
| 419 |
+
)
|
| 420 |
+
|
| 421 |
+
conv_chatml_direct = Conversation(
|
| 422 |
+
system="""<|im_start|>system
|
| 423 |
+
Answer the questions.""",
|
| 424 |
+
roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
|
| 425 |
+
version="mpt",
|
| 426 |
+
messages=(),
|
| 427 |
+
offset=0,
|
| 428 |
+
sep_style=SeparatorStyle.MPT,
|
| 429 |
+
sep="<|im_end|>",
|
| 430 |
+
)
|
| 431 |
+
|
| 432 |
+
conv_llama3 = Conversation(
|
| 433 |
+
system="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.""",
|
| 434 |
+
roles=("<|start_header_id|>user<|end_header_id|>\n\n", "<|start_header_id|>assistant<|end_header_id|>\n\n"),
|
| 435 |
+
version="llama3",
|
| 436 |
+
messages=(),
|
| 437 |
+
offset=0,
|
| 438 |
+
sep_style=SeparatorStyle.MPT,
|
| 439 |
+
sep="<|eot_id|>",
|
| 440 |
+
)
|
| 441 |
+
|
| 442 |
+
conv_qwen = Conversation(
|
| 443 |
+
system="""<|im_start|>system
|
| 444 |
+
You are a helpful assistant.""",
|
| 445 |
+
roles=("<|im_start|>user", "<|im_start|>assistant"),
|
| 446 |
+
version="qwen",
|
| 447 |
+
messages=[],
|
| 448 |
+
offset=0,
|
| 449 |
+
sep_style=SeparatorStyle.CHATML,
|
| 450 |
+
sep="<|im_end|>",
|
| 451 |
+
)
|
| 452 |
+
|
| 453 |
+
|
| 454 |
+
default_conversation = conv_llama3
|
| 455 |
+
conv_templates = {
|
| 456 |
+
"default": conv_vicuna_v0,
|
| 457 |
+
"v0": conv_vicuna_v0,
|
| 458 |
+
"v1": conv_vicuna_v1,
|
| 459 |
+
"vicuna_v1": conv_vicuna_v1,
|
| 460 |
+
"llama_2": conv_llama_2,
|
| 461 |
+
"mistral_instruct": conv_mistral_instruct,
|
| 462 |
+
"chatml_direct": conv_chatml_direct,
|
| 463 |
+
"mistral_direct": conv_chatml_direct,
|
| 464 |
+
"plain": conv_blip3o_plain,
|
| 465 |
+
"v0_plain": conv_blip3o_plain,
|
| 466 |
+
"blip3o_v0": conv_blip3o_v0,
|
| 467 |
+
"v0_mmtag": conv_blip3o_v0_mmtag,
|
| 468 |
+
"blip3o_v1": conv_blip3o_v1,
|
| 469 |
+
"v1_mmtag": conv_blip3o_v1_mmtag,
|
| 470 |
+
"llama3": conv_llama3,
|
| 471 |
+
"qwen": conv_qwen,
|
| 472 |
+
"mpt": conv_mpt,
|
| 473 |
+
}
|
| 474 |
+
|
| 475 |
+
if __name__ == "__main__":
|
| 476 |
+
print(default_conversation.get_prompt())
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/mm_utils.py
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from PIL import Image
|
| 2 |
+
from io import BytesIO
|
| 3 |
+
import base64
|
| 4 |
+
import torch
|
| 5 |
+
import math
|
| 6 |
+
import ast
|
| 7 |
+
|
| 8 |
+
from transformers import StoppingCriteria
|
| 9 |
+
from blip3o.constants import IMAGE_TOKEN_IDX
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def select_best_resolution(original_size, possible_resolutions):
|
| 13 |
+
"""
|
| 14 |
+
Selects the best resolution from a list of possible resolutions based on the original size.
|
| 15 |
+
|
| 16 |
+
Args:
|
| 17 |
+
original_size (tuple): The original size of the image in the format (width, height).
|
| 18 |
+
possible_resolutions (list): A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
|
| 19 |
+
|
| 20 |
+
Returns:
|
| 21 |
+
tuple: The best fit resolution in the format (width, height).
|
| 22 |
+
"""
|
| 23 |
+
original_width, original_height = original_size
|
| 24 |
+
best_fit = None
|
| 25 |
+
max_effective_resolution = 0
|
| 26 |
+
min_wasted_resolution = float('inf')
|
| 27 |
+
|
| 28 |
+
for width, height in possible_resolutions:
|
| 29 |
+
scale = min(width / original_width, height / original_height)
|
| 30 |
+
downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
|
| 31 |
+
effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
|
| 32 |
+
wasted_resolution = (width * height) - effective_resolution
|
| 33 |
+
|
| 34 |
+
if effective_resolution > max_effective_resolution or (effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution):
|
| 35 |
+
max_effective_resolution = effective_resolution
|
| 36 |
+
min_wasted_resolution = wasted_resolution
|
| 37 |
+
best_fit = (width, height)
|
| 38 |
+
|
| 39 |
+
return best_fit
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def resize_and_pad_image(image, target_resolution):
|
| 43 |
+
"""
|
| 44 |
+
Resize and pad an image to a target resolution while maintaining aspect ratio.
|
| 45 |
+
|
| 46 |
+
Args:
|
| 47 |
+
image (PIL.Image.Image): The input image.
|
| 48 |
+
target_resolution (tuple): The target resolution (width, height) of the image.
|
| 49 |
+
|
| 50 |
+
Returns:
|
| 51 |
+
PIL.Image.Image: The resized and padded image.
|
| 52 |
+
"""
|
| 53 |
+
original_width, original_height = image.size
|
| 54 |
+
target_width, target_height = target_resolution
|
| 55 |
+
|
| 56 |
+
scale_w = target_width / original_width
|
| 57 |
+
scale_h = target_height / original_height
|
| 58 |
+
|
| 59 |
+
if scale_w < scale_h:
|
| 60 |
+
new_width = target_width
|
| 61 |
+
new_height = min(math.ceil(original_height * scale_w), target_height)
|
| 62 |
+
else:
|
| 63 |
+
new_height = target_height
|
| 64 |
+
new_width = min(math.ceil(original_width * scale_h), target_width)
|
| 65 |
+
|
| 66 |
+
# Resize the image
|
| 67 |
+
resized_image = image.resize((new_width, new_height))
|
| 68 |
+
|
| 69 |
+
new_image = Image.new('RGB', (target_width, target_height), (0, 0, 0))
|
| 70 |
+
paste_x = (target_width - new_width) // 2
|
| 71 |
+
paste_y = (target_height - new_height) // 2
|
| 72 |
+
new_image.paste(resized_image, (paste_x, paste_y))
|
| 73 |
+
|
| 74 |
+
return new_image
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def divide_to_patches(image, patch_size):
|
| 78 |
+
"""
|
| 79 |
+
Divides an image into patches of a specified size.
|
| 80 |
+
|
| 81 |
+
Args:
|
| 82 |
+
image (PIL.Image.Image): The input image.
|
| 83 |
+
patch_size (int): The size of each patch.
|
| 84 |
+
|
| 85 |
+
Returns:
|
| 86 |
+
list: A list of PIL.Image.Image objects representing the patches.
|
| 87 |
+
"""
|
| 88 |
+
patches = []
|
| 89 |
+
width, height = image.size
|
| 90 |
+
for i in range(0, height, patch_size):
|
| 91 |
+
for j in range(0, width, patch_size):
|
| 92 |
+
box = (j, i, j + patch_size, i + patch_size)
|
| 93 |
+
patch = image.crop(box)
|
| 94 |
+
patches.append(patch)
|
| 95 |
+
|
| 96 |
+
return patches
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
    """
    Calculate the shape of the image patch grid after the preprocessing for images of any resolution.

    Args:
        image_size (tuple): The size of the input image in the format (width, height).
        grid_pinpoints (str): A string representation of a list of possible resolutions
            (a list may also be passed directly).
        patch_size (int): The size of each image patch.

    Returns:
        tuple: The shape of the image patch grid in the format (width, height).
    """
    # Accept either an already-parsed list or its string representation.
    candidates = grid_pinpoints if type(grid_pinpoints) is list else ast.literal_eval(grid_pinpoints)
    best_w, best_h = select_best_resolution(image_size, candidates)
    return best_w // patch_size, best_h // patch_size
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def process_anyres_image(image, processor, grid_pinpoints):
    """
    Process an image with variable resolutions.

    The image is resized/padded to the best candidate resolution, tiled into
    processor-sized crops, and a globally resized view is prepended; each view
    is run through the image processor.

    Args:
        image (PIL.Image.Image): The input image to be processed.
        processor: The image processor object (HF-style, exposing
            ``crop_size``, ``size`` and ``preprocess``).
        grid_pinpoints (str): A string representation of a list of possible
            resolutions (a list may also be passed directly).

    Returns:
        torch.Tensor: A tensor containing the processed image patches,
        stacked along dim 0 with the global view first.
    """
    resolutions = grid_pinpoints if type(grid_pinpoints) is list else ast.literal_eval(grid_pinpoints)
    best_resolution = select_best_resolution(image.size, resolutions)
    padded = resize_and_pad_image(image, best_resolution)

    tiles = divide_to_patches(padded, processor.crop_size['height'])

    # Global view at the processor's base resolution, placed first.
    short_edge = processor.size['shortest_edge']
    base_view = image.resize((short_edge, short_edge))

    pixel_tensors = []
    for view in [base_view] + tiles:
        pixel_tensors.append(processor.preprocess(view, return_tensors='pt')['pixel_values'][0])
    return torch.stack(pixel_tensors, dim=0)
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def load_image_from_base64(image):
    """Decode a base64-encoded image payload into a PIL image."""
    raw_bytes = base64.b64decode(image)
    return Image.open(BytesIO(raw_bytes))
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def expand2square(pil_img, background_color):
    """Pad ``pil_img`` to a centered square canvas of ``background_color``.

    The output side equals max(width, height). A square input is returned
    unchanged (same object, no copy).
    """
    width, height = pil_img.size
    if width == height:
        return pil_img

    side = max(width, height)
    canvas = Image.new(pil_img.mode, (side, side), background_color)
    offset = abs(width - height) // 2
    if width > height:
        canvas.paste(pil_img, (0, offset))
    else:
        canvas.paste(pil_img, (offset, 0))
    return canvas
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def process_images(images, image_processor, model_cfg):
    """Preprocess a batch of PIL images according to the model's aspect-ratio mode.

    Modes (from ``model_cfg.image_aspect_ratio``):
      * ``'pad'``    — pad each image to a square filled with the processor mean.
      * ``'anyres'`` — tile each image via :func:`process_anyres_image`.
      * otherwise    — delegate directly to the image processor.

    Returns a single stacked tensor when all per-image tensors share a shape,
    otherwise a list of tensors.
    """
    aspect_mode = getattr(model_cfg, "image_aspect_ratio", None)
    if aspect_mode == 'pad':
        # Background color = per-channel processor mean, scaled to 0-255.
        fill = tuple(int(channel_mean * 255) for channel_mean in image_processor.image_mean)
        processed = [
            image_processor.preprocess(expand2square(img, fill), return_tensors='pt')['pixel_values'][0]
            for img in images
        ]
    elif aspect_mode == "anyres":
        processed = [
            process_anyres_image(img, image_processor, model_cfg.image_grid_pinpoints)
            for img in images
        ]
    else:
        return image_processor(images, return_tensors='pt')['pixel_values']

    if all(t.shape == processed[0].shape for t in processed):
        return torch.stack(processed, dim=0)
    return processed
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_IDX, return_tensors=None):
    """Tokenize ``prompt``, replacing every ``<image>`` marker with ``image_token_index``.

    The prompt is split on ``<image>``, each segment is tokenized, and a
    single image-token id is inserted between consecutive segments. If the
    tokenizer prepends a BOS id, it is kept once at the front and stripped
    from every segment.

    Args:
        prompt (str): Prompt text, possibly containing ``<image>`` markers.
        tokenizer: Tokenizer callable returning an object with ``input_ids``.
        image_token_index (int): Id substituted for each ``<image>`` marker.
        return_tensors (str | None): ``'pt'`` for a long tensor, ``None`` for
            a plain list.

    Returns:
        list[int] | torch.Tensor: The assembled token ids.

    Raises:
        ValueError: If ``return_tensors`` is neither ``None`` nor ``'pt'``.
    """
    chunk_ids = [tokenizer(segment).input_ids for segment in prompt.split('<image>')]

    token_ids = []
    bos_offset = 0
    if chunk_ids and chunk_ids[0] and chunk_ids[0][0] == tokenizer.bos_token_id:
        # Keep one leading BOS; later slices drop the BOS each chunk repeats.
        bos_offset = 1
        token_ids.append(chunk_ids[0][0])

    for pos, chunk in enumerate(chunk_ids):
        if pos > 0:
            token_ids.append(image_token_index)
        token_ids.extend(chunk[bos_offset:])

    if return_tensors is None:
        return token_ids
    if return_tensors == 'pt':
        return torch.tensor(token_ids, dtype=torch.long)
    raise ValueError(f'Unsupported tensor type: {return_tensors}')
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
def get_model_name_from_path(model_path):
    """Derive a short model name from a filesystem path or repo id.

    A trailing ``checkpoint-*`` component is prefixed with its parent
    directory name so checkpoints from different runs stay distinguishable.
    """
    parts = model_path.strip("/").split("/")
    if parts[-1].startswith('checkpoint-'):
        return f"{parts[-2]}_{parts[-1]}"
    return parts[-1]
|
| 214 |
+
|
| 215 |
+
class KeywordsStoppingCriteria(StoppingCriteria):
    """Stop generation once any of the given keyword strings is produced.

    Matching is attempted first as an exact token-id suffix match, then as a
    substring match on the decoded text of the most recently generated tokens.
    """

    def __init__(self, keywords, tokenizer, input_ids):
        self.keywords = keywords
        self.keyword_ids = []
        self.max_keyword_len = 0
        for keyword in keywords:
            ids = tokenizer(keyword).input_ids
            # Drop the leading BOS that some tokenizers prepend, so the
            # suffix comparison sees only the keyword's own tokens.
            if len(ids) > 1 and ids[0] == tokenizer.bos_token_id:
                ids = ids[1:]
            self.max_keyword_len = max(self.max_keyword_len, len(ids))
            self.keyword_ids.append(torch.tensor(ids))
        self.tokenizer = tokenizer
        # Length of the prompt; only tokens after this are generated output.
        self.start_len = input_ids.shape[1]

    def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when row 0 of ``output_ids`` ends in a keyword."""
        offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
        # Lazily move cached keyword ids to the output device.
        self.keyword_ids = [kid.to(output_ids.device) for kid in self.keyword_ids]
        for kid in self.keyword_ids:
            if torch.equal(output_ids[0, -kid.shape[0]:], kid):
                return True
        # Fallback: decode the generated tail and look for the raw strings.
        decoded_tail = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
        return any(keyword in decoded_tail for keyword in self.keywords)

    def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Stop only when every sequence in the batch has hit a keyword."""
        verdicts = [
            self.call_for_batch(output_ids[row].unsqueeze(0), scores)
            for row in range(output_ids.shape[0])
        ]
        return all(verdicts)
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .language_model.blip3o_qwen import blip3oQwenForCausalLM, blip3oQwenConfig
|
| 2 |
+
from .language_model.blip3o_qwen_inference import blip3oQwenForInferenceLM, blip3oQwenConfig
|
| 3 |
+
|
| 4 |
+
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (397 Bytes). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/__pycache__/blip3o_arch.cpython-311.pyc
ADDED
|
Binary file (20.8 kB). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/__pycache__/builder.cpython-311.pyc
ADDED
|
Binary file (4.06 kB). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/__pycache__/llava_arch.cpython-311.pyc
ADDED
|
Binary file (19.3 kB). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/__pycache__/lumina_nextdit2d.cpython-311.pyc
ADDED
|
Binary file (17 kB). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/__pycache__/nextdit_crossattn.cpython-311.pyc
ADDED
|
Binary file (4.35 kB). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/apply_delta.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Usage:
|
| 3 |
+
python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta
|
| 4 |
+
"""
|
| 5 |
+
import argparse
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 10 |
+
from blip3o import blip3oLlamaForCausalLM
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def apply_delta(base_model_path, target_model_path, delta_path):
    """Add base-model weights to a delta checkpoint and save the merged model.

    Loads the base causal LM and the delta checkpoint, adds each base
    parameter into the matching delta parameter in place (so the delta
    model's weights become base + delta), then saves the merged model and
    the delta tokenizer to ``target_model_path``.

    Args:
        base_model_path: Path or repo id of the base causal LM.
        target_model_path: Output directory for the merged model.
        delta_path: Path or repo id of the delta checkpoint.
    """
    print("Loading base model")
    base = AutoModelForCausalLM.from_pretrained(
        base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)

    print("Loading delta")
    delta = blip3oLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
    delta_tokenizer = AutoTokenizer.from_pretrained(delta_path)

    print("Applying delta")
    # state_dict() rebuilds the whole mapping on every call; fetch it once
    # instead of several times per parameter inside the loop.
    base_state = base.state_dict()
    for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"):
        if name not in base_state:
            # Projector weights exist only in the delta model.
            assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model'
            continue
        bparam = base_state[name]
        if param.data.shape == bparam.shape:
            param.data += bparam
        else:
            # Vocabulary may have grown in the delta model; only the
            # overlapping rows/columns receive the base weights.
            assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \
                f'{name} dimension mismatch: {param.data.shape} vs {bparam.shape}'
            param.data[:bparam.shape[0], :bparam.shape[1]] += bparam

    print("Saving target model")
    delta.save_pretrained(target_model_path)
    delta_tokenizer.save_pretrained(target_model_path)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
if __name__ == "__main__":
    # CLI entry point: all three paths are mandatory.
    parser = argparse.ArgumentParser()
    for flag in ("--base-model-path", "--target-model-path", "--delta-path"):
        parser.add_argument(flag, type=str, required=True)

    args = parser.parse_args()

    apply_delta(args.base_model_path, args.target_model_path, args.delta_path)
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/blip3o_arch.py
ADDED
|
@@ -0,0 +1,371 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from abc import ABC, abstractmethod
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
+
|
| 7 |
+
from .multimodal_encoder.builder import build_vision_tower, build_gen_vision_tower, build_dit
|
| 8 |
+
from .multimodal_projector.builder import build_vision_projector, build_down_projector, build_gen_vision_projector
|
| 9 |
+
|
| 10 |
+
from blip3o.constants import IGNORE_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IMAGE_TOKEN_IDX, DEFAULT_IM_START_TOKEN_IDX, DEFAULT_IM_END_TOKEN_IDX, UND_IMAGE_TOKEN_IDX
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class blip3oMetaModel:
    """Mixin that owns the vision / generation submodules of the blip3o model.

    Intended to be combined with a transformers model base class:
    ``__init__`` forwards ``config`` to ``super().__init__``, so this class
    is not usable stand-alone.
    """

    def __init__(self, config):
        super(blip3oMetaModel, self).__init__(config)

        # A config that already carries ``gen_vision_tower`` means we are
        # re-loading a trained checkpoint: rebuild the generation-side
        # modules immediately (the tower itself with delayed weight load).
        if hasattr(config, "gen_vision_tower"):
            self.gen_vision_tower = build_gen_vision_tower(config, delay_load=True)
            # Learned query embeddings, one set of n_query vectors broadcast
            # over the batch at usage time.
            self.latent_queries = nn.Parameter(torch.randn(1, config.n_query, config.hidden_size))
            print(f" latent query size {self.latent_queries.shape}")

            if 'unpad' in getattr(config, 'mm_patch_merge_type', ''):
                # Row-separator embedding used by the 'unpad' merge scheme;
                # left uninitialized here (values come from the checkpoint).
                self.image_newline = nn.Parameter(
                    torch.empty(config.hidden_size, dtype=self.dtype)
                )

            self.dit, self.noise_scheduler = build_dit(config)

    def get_vision_tower(self):
        # FSDP wraps the tower in a one-element list; unwrap transparently.
        vision_tower = getattr(self, 'vision_tower', None)
        if type(vision_tower) is list:
            vision_tower = vision_tower[0]
        return vision_tower

    def get_gen_vision_tower(self):
        # Same list-unwrapping convention as get_vision_tower (FSDP).
        gen_vision_tower = getattr(self, 'gen_vision_tower', None)
        if type(gen_vision_tower) is list:
            gen_vision_tower = gen_vision_tower[0]
        return gen_vision_tower

    def initialize_vision_modules(self, model_args, fsdp=None):
        """Build or reload every vision-related module and sync config fields.

        Idempotent-by-checkpoint: each module is freshly built only when it
        does not already exist on ``self``; otherwise it is kept (loaded from
        checkpoint) and marked trainable.

        Args:
            model_args: Training-argument namespace carrying tower names,
                projector settings, pooling/query hyperparameters.
            fsdp: Optional list of FSDP settings; when non-empty, towers are
                stored wrapped in a one-element list.
        """
        gen_vision_tower = model_args.gen_vision_tower

        mm_vision_select_layer = model_args.mm_vision_select_layer
        mm_vision_select_feature = model_args.mm_vision_select_feature

        pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter
        # NOTE(review): pretrain_gen_mlp_adapter is read but never used below
        # — confirm whether the generation-adapter load was dropped on purpose.
        pretrain_gen_mlp_adapter = model_args.pretrain_gen_mlp_adapter

        mm_patch_merge_type = model_args.mm_patch_merge_type

        self.config.gen_vision_tower = gen_vision_tower
        self.config.vision_tower_pretrained = getattr(model_args, "vision_tower_pretrained", "")

        if getattr(self, 'dit', None) is None:
            print("random initiation the DiT !!!")
            self.dit, self.noise_scheduler = build_dit(model_args)
        else:
            print("DiT load from checkpoint!!!")
            # Reloaded DiT is fine-tuned: unfreeze all parameters.
            for p in self.dit.parameters():
                p.requires_grad = True

        # Understanding tower: build fresh, or reload weights if it exists.
        if self.get_vision_tower() is None:
            vision_tower = build_vision_tower(model_args)
            if fsdp is not None and len(fsdp) > 0:
                self.vision_tower = [vision_tower]
            else:
                self.vision_tower = vision_tower
        else:
            if fsdp is not None and len(fsdp) > 0:
                vision_tower = self.vision_tower[0]
            else:
                vision_tower = self.vision_tower
            vision_tower.load_model()

        # Generation tower: same build-or-reload pattern.
        if self.get_gen_vision_tower() is None:
            gen_vision_tower = build_gen_vision_tower(model_args)

            if fsdp is not None and len(fsdp) > 0:
                self.gen_vision_tower = [gen_vision_tower]
            else:
                self.gen_vision_tower = gen_vision_tower
        else:
            if fsdp is not None and len(fsdp) > 0:
                gen_vision_tower = self.gen_vision_tower[0]
            else:
                gen_vision_tower = self.gen_vision_tower
            gen_vision_tower.load_model()

        self.config.use_mm_proj = True
        self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear')

        self.config.mm_hidden_size = vision_tower.config.hidden_size

        # The two supported generation towers expose their width differently.
        if 'eva' in model_args.gen_vision_tower:
            self.config.gen_hidden_size = gen_vision_tower.hidden_size
        elif 'siglip2' in model_args.gen_vision_tower:
            self.config.gen_hidden_size = gen_vision_tower.config.hidden_size

        self.config.mm_vision_select_layer = mm_vision_select_layer
        self.config.mm_vision_select_feature = mm_vision_select_feature
        self.config.mm_patch_merge_type = mm_patch_merge_type
        self.config.n_query = model_args.n_query
        self.config.gen_pooling = model_args.gen_pooling

        if getattr(self, 'mm_projector', None) is None:
            print("random initiation the image understanding projection !!!")
            self.mm_projector = build_vision_projector(self.config)
            if 'unpad' in mm_patch_merge_type:
                # Initialize image_newline to the same scale as embeddings.
                embed_std = 1 / torch.sqrt(torch.tensor(self.config.hidden_size, dtype=self.dtype))
                self.image_newline = nn.Parameter(
                    torch.randn(self.config.hidden_size, dtype=self.dtype) * embed_std
                )
        else:
            print("Image understanding projection load from checkpoint!!!")
            for p in self.mm_projector.parameters():
                p.requires_grad = True

        if getattr(self, 'down_projector', None) is None:
            self.down_projector = build_down_projector(self.config)
        else:
            for p in self.down_projector.parameters():
                p.requires_grad = True

        if getattr(self, 'latent_queries', None) is None:
            print("random initiation the latent_queries !!!")
            self.latent_queries = nn.Parameter(torch.randn(1, self.config.n_query, self.config.hidden_size))
        else:
            print("latent_queries load from checkpoint!!!")
            self.latent_queries.requires_grad = True

        if pretrain_mm_mlp_adapter is not None:
            mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
            # NOTE(review): get_w is defined but its result is never applied
            # to mm_projector here — the pretrained-adapter load looks
            # incomplete; verify against the training pipeline.
            def get_w(weights, keyword):
                return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k}
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def unpad_image(tensor, original_size):
    """Crop letterbox/pillarbox padding from a resized CxHxW image tensor.

    Args:
        tensor (torch.Tensor): The image tensor, assumed to be in CxHxW format.
        original_size (tuple): The original size of the PIL image (width, height).

    Returns:
        torch.Tensor: The unpadded image tensor.
    """
    orig_w, orig_h = original_size
    cur_h, cur_w = tensor.shape[1:]

    # Original wider than the canvas -> padding was added top/bottom;
    # otherwise padding (if any) was added left/right.
    if orig_w / orig_h > cur_w / cur_h:
        content_h = int(orig_h * (cur_w / orig_w))
        pad = (cur_h - content_h) // 2
        return tensor[:, pad:cur_h - pad, :]

    content_w = int(orig_w * (cur_h / orig_h))
    pad = (cur_w - content_w) // 2
    return tensor[:, :, pad:cur_w - pad]
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
class blip3oMetaForCausalLM(ABC):
    """Abstract mixin adding multimodal input preparation to a causal LM.

    Concrete subclasses must supply ``get_model()`` returning the underlying
    model that owns the vision towers, projectors and latent queries.
    """

    @abstractmethod
    def get_model(self):
        pass

    def get_vision_tower(self):
        return self.get_model().get_vision_tower()

    def get_gen_vision_tower(self):
        return self.get_model().get_gen_vision_tower()

    def encode_image(self, images):
        # Encode images with the generation tower, then spatially pool.
        gen_vision_tower = self.get_gen_vision_tower()
        device = gen_vision_tower.device
        images = images.to(device)
        prompt_image_embeds = gen_vision_tower(images)
        prompt_image_embeds = self.pool_img(prompt_image_embeds)
        # NOTE(review): pool_img returns the raw adaptive_avg_pool2d output
        # (4-D, channels-first); this 3-way unpack only works if the pooled
        # result is 3-D — confirm the expected shape with callers.
        num_img, _, c = prompt_image_embeds.shape
        # prompt_image_embeds = prompt_image_embeds.contiguous().view(-1, c)

        # ------------- compute similarity -------
        # NOTE(review): this min-distance statistic is computed but discarded
        # (the self.dist assignment is commented out) — dead diagnostic code.
        all_dist = 0
        count = 0
        for i in range(2, prompt_image_embeds.shape[1]-1):
            diff = (prompt_image_embeds[:,i,:].unsqueeze(1) - prompt_image_embeds[:,:i,:])
            dist = torch.sqrt(diff.square().sum(-1)).min().item()
            all_dist+=dist
            count+=1
        all_dist /= count
        # self.dist = all_dist
        # print(self.dist)

        return prompt_image_embeds

    def get_mm_projector(self):
        return self.get_model().mm_projector

    def get_n_query(self):
        return self.get_model().config.n_query

    def get_gen_pooling(self):
        return self.get_model().config.gen_pooling

    def pool_img(self, image_features):
        """Spatially pool (num_img, n, c) features to n_query positions.

        Assumes n is a perfect square (square patch grid) — TODO confirm.
        Returns the channels-first pooled tensor
        (num_img, c, sqrt(n_query), sqrt(n_query)).
        """
        num_img, n, c = image_features.shape
        gen_pooling = self.get_gen_pooling()
        n_query = self.get_n_query()
        # NOTE(review): stride is parsed from the pooling spec but unused.
        stride = int(gen_pooling.split('_')[-1])
        sqrt_n = int(n**0.5)
        image_features = image_features.view(-1, sqrt_n, sqrt_n, c)
        image_features = (
            nn.functional.adaptive_avg_pool2d(image_features.permute(0, 3, 1, 2), int(n_query**0.5))
        )
        return image_features

    def get_sigmas(self, timesteps, device, n_dim=4, dtype=torch.float32):
        """Look up scheduler sigmas for the given timesteps.

        The result is broadcast-ready: trailing singleton dims are appended
        until the tensor has ``n_dim`` dimensions.
        """
        sigmas = self.get_model().noise_scheduler.sigmas.to(device=device, dtype=dtype)
        schedule_timesteps = self.get_model().noise_scheduler.timesteps.to(device=device)
        timesteps = timesteps.to(device)
        # Each timestep must match exactly one scheduler timestep.
        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]

        sigma = sigmas[step_indices].flatten()
        while len(sigma.shape) < n_dim:
            sigma = sigma.unsqueeze(-1)
        return sigma

    def mask_drop(self, latents, drop_prob=0.1):
        """Randomly zero whole samples with probability ``drop_prob``.

        Used for classifier-free-guidance-style conditioning dropout —
        TODO confirm intent with the training loop.
        """
        if drop_prob <= 0:
            return latents
        mask = torch.bernoulli(torch.zeros(latents.shape[0], device=latents.device, dtype=latents.dtype) + drop_prob)
        while len(mask.shape) < len(latents.shape):
            mask = mask.unsqueeze(-1)
        mask = 1 - mask  # need to flip 0 <-> 1
        return latents * mask

    def prepare_inputs_labels_for_multimodal(
        self, input_ids, position_ids, attention_mask, past_key_values, labels,
        gen_images, und_images, i_s_pos, image_sizes=None
    ):
        """Splice image embeddings into the text embedding sequence.

        Generation-image token positions (labels != -100) receive the learned
        latent queries; understanding-image token positions (labels == -100)
        receive projected vision-tower features. ``labels`` is modified in
        place (image positions masked to -100).

        NOTE(review): the early return below has 9 elements while the final
        return has 7 — callers must be unpacking these differently; confirm.
        NOTE(review): ``i_s_pos`` and ``image_sizes`` are accepted but unused.
        """
        vision_tower = self.get_vision_tower()
        mm_projector = self.get_mm_projector()
        gen_vision_tower = self.get_gen_vision_tower()
        # Nothing to splice (text-only, or single-token decode step).
        if (gen_images is None and und_images is None) or input_ids.shape[1] == 1:
            return input_ids, position_ids, attention_mask, past_key_values, None, labels, None, None, None

        if not gen_images is None:
            prompt_image_embeds = gen_vision_tower(gen_images)  # prompt_image_embeds = gen_vision_tower(gen_images).last_hidden_state
            ## pooling
            prompt_image_embeds = self.pool_img(prompt_image_embeds)
            # Detached copy serves as the regression target for the DiT.
            target_image_embeds = torch.clone(prompt_image_embeds).detach()
            latent_queries = self.get_model().latent_queries.repeat(gen_images.shape[0], 1, 1)
            H = latent_queries.shape[-1]
            latent_queries = latent_queries.contiguous().view(-1, H)
        else:
            target_image_embeds = None

        if not und_images is None:
            # Understanding path: tower features -> projector, flattened so
            # they can be scattered into token positions.
            und_image_embeds = vision_tower(und_images).last_hidden_state
            num_img, _, c = und_image_embeds.shape
            und_image_embeds = und_image_embeds.contiguous().view(-1, c)
            und_image_embeds = mm_projector(und_image_embeds)

        image_idx = (input_ids == IMAGE_TOKEN_IDX)
        und_image_idx = (input_ids == UND_IMAGE_TOKEN_IDX)
        # Output positions are supervised (labels != -100); input positions
        # are masked-out context.
        output_indicator = labels != -100
        input_indicator = labels == -100

        text_embeds = self.get_model().embed_tokens(input_ids)
        text_embeds = text_embeds.clone()
        gen_img_idx = torch.logical_and(output_indicator, image_idx)
        if not gen_images is None:
            text_embeds[gen_img_idx] = latent_queries

        und_img_idx = torch.logical_and(input_indicator, und_image_idx)
        if not und_images is None:
            text_embeds[und_img_idx] = und_image_embeds.to(text_embeds.device)[:und_img_idx.sum(), :]

        # Image positions carry no text supervision.
        labels[image_idx] = -100

        return None, position_ids, attention_mask, past_key_values, text_embeds, labels, target_image_embeds

    def initialize_vision_tokenizer(self, model_args, tokenizer):
        """Add image special tokens to the tokenizer and resize embeddings.

        NOTE(review): DEFAULT_IMAGE_PATCH_TOKEN is not among this module's
        visible imports from blip3o.constants — verify it is imported, or
        this path raises NameError.
        """
        if model_args.mm_use_im_patch_token:
            tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
            self.resize_token_embeddings(len(tokenizer))

        if model_args.mm_use_im_start_end:
            num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
            self.resize_token_embeddings(len(tokenizer))

            if num_new_tokens > 0:
                input_embeddings = self.get_input_embeddings().weight.data
                output_embeddings = self.get_output_embeddings().weight.data

                # Initialize the new rows with the mean of existing rows.
                input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
                    dim=0, keepdim=True)
                output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
                    dim=0, keepdim=True)

                input_embeddings[-num_new_tokens:] = input_embeddings_avg
                output_embeddings[-num_new_tokens:] = output_embeddings_avg

            if model_args.tune_mm_mlp_adapter:
                # Train input embeddings only; keep the LM head frozen.
                for p in self.get_input_embeddings().parameters():
                    p.requires_grad = True
                for p in self.get_output_embeddings().parameters():
                    p.requires_grad = False

            if model_args.pretrain_mm_mlp_adapter:
                mm_projector_weights = torch.load(model_args.pretrain_mm_mlp_adapter, map_location='cpu')
                embed_tokens_weight = mm_projector_weights['model.embed_tokens.weight']
                assert num_new_tokens == 2
                if input_embeddings.shape == embed_tokens_weight.shape:
                    input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:]
                elif embed_tokens_weight.shape[0] == num_new_tokens:
                    input_embeddings[-num_new_tokens:] = embed_tokens_weight
                else:
                    raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Numer of new tokens: {num_new_tokens}.")
        elif model_args.mm_use_im_patch_token:
            if model_args.tune_mm_mlp_adapter:
                for p in self.get_input_embeddings().parameters():
                    p.requires_grad = False
                for p in self.get_output_embeddings().parameters():
                    p.requires_grad = False
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/builder.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import warnings
|
| 3 |
+
import shutil
|
| 4 |
+
|
| 5 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
|
| 6 |
+
import torch
|
| 7 |
+
from blip3o.model import *
|
| 8 |
+
from blip3o.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
|
| 9 |
+
from blip3o.train.train import smart_tokenizer_and_embedding_resize
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def load_pretrained_model(model_path, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
    """Load a blip3o Qwen inference model plus its tokenizer.

    Returns:
        tuple: (tokenizer, model, context_len).

    NOTE(review): the ``kwargs`` dict assembled below (device_map,
    quantization, flash-attn, dtype) is never passed to ``from_pretrained``
    — the load_8bit/load_4bit/device_map/use_flash_attn arguments are
    currently no-ops. Confirm whether this is intentional.
    """
    kwargs = {"device_map": device_map, **kwargs}

    if device != "cuda":
        kwargs['device_map'] = {"": device}

    if load_8bit:
        kwargs['load_in_8bit'] = True
    elif load_4bit:
        kwargs['load_in_4bit'] = True
        kwargs['quantization_config'] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4'
        )
    else:
        kwargs['torch_dtype'] = torch.float16

    if use_flash_attn:
        kwargs['attn_implementation'] = 'flash_attention_2'

    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)

    # NOTE(review): hardcoded 'cuda:0' ignores the device/device_map args.
    model = blip3oQwenForInferenceLM.from_pretrained(model_path, low_cpu_mem_usage=True, torch_dtype=torch.float16).to('cuda:0')

    # image_processor is assigned but never populated or returned here.
    image_processor = None
    mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
    mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
    if mm_use_im_patch_token:
        tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
    if mm_use_im_start_end:
        tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
    model.resize_token_embeddings(len(tokenizer))

    if hasattr(model.config, "max_sequence_length"):
        context_len = model.config.max_sequence_length
    else:
        context_len = 2048

    return tokenizer, model, context_len
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def load_pretrained_model_lmms_eval(model_path, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
    """Load a blip3o Qwen model/tokenizer pair for lmms-eval harness runs.

    Identical to ``load_pretrained_model`` except that the fast tokenizer is
    allowed and the model is not eagerly moved to a specific GPU.

    Args:
        model_path: HF hub id or local path of the pretrained checkpoint.
        load_8bit: load the weights with 8-bit quantization.
        load_4bit: load the weights with NF4 4-bit quantization (only
            consulted when ``load_8bit`` is False).
        device_map: accelerate device map used for weight placement.
        device: target device; any value other than "cuda" pins the whole
            model onto that single device.
        use_flash_attn: enable the flash-attention-2 attention backend.
        **kwargs: extra keyword arguments forwarded to ``from_pretrained``.

    Returns:
        ``(tokenizer, model, context_len)`` triple.
    """
    kwargs = {"device_map": device_map, **kwargs}

    if device != "cuda":
        # Pin every module to the requested device instead of auto-sharding.
        kwargs['device_map'] = {"": device}

    if load_8bit:
        kwargs['load_in_8bit'] = True
    elif load_4bit:
        kwargs['load_in_4bit'] = True
        kwargs['quantization_config'] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4'
        )
    else:
        kwargs['torch_dtype'] = torch.float16

    if use_flash_attn:
        kwargs['attn_implementation'] = 'flash_attention_2'

    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # BUG FIX: the kwargs assembled above were previously discarded and the
    # model was always loaded in plain fp16. Forward them so quantization,
    # device placement and the attention backend actually apply.
    model = blip3oQwenForInferenceLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)

    mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
    mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
    if mm_use_im_patch_token:
        tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
    if mm_use_im_start_end:
        tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
    # Keep the embedding table in sync with any tokens added above.
    model.resize_token_embeddings(len(tokenizer))

    if hasattr(model.config, "max_sequence_length"):
        context_len = model.config.max_sequence_length
    else:
        context_len = 2048

    return tokenizer, model, context_len
|
| 103 |
+
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/consolidate.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 5 |
+
from blip3o.model import *
|
| 6 |
+
from blip3o.model.utils import auto_upgrade
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def consolidate_ckpt(src_path, dst_path):
    """Re-serialize the checkpoint at ``src_path`` into ``dst_path``.

    Upgrades the on-disk config in place first, then round-trips the fp16
    model weights and the tokenizer through ``save_pretrained`` so the
    destination is a clean, self-contained HF checkpoint.
    """
    print("Loading model")
    auto_upgrade(src_path)
    model = AutoModelForCausalLM.from_pretrained(
        src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
    )
    tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False)
    # Write the model first, then the tokenizer, matching the original order.
    for artifact in (model, tokenizer):
        artifact.save_pretrained(dst_path)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
if __name__ == "__main__":
    # CLI entry point: consolidate a checkpoint from --src into --dst.
    cli = argparse.ArgumentParser()
    cli.add_argument("--src", type=str, required=True)
    cli.add_argument("--dst", type=str, required=True)
    opts = cli.parse_args()
    consolidate_ckpt(opts.src, opts.dst)
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/language_model/__pycache__/blip3o_llama.cpython-311.pyc
ADDED
|
Binary file (21.3 kB). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/language_model/__pycache__/blip3o_qwen.cpython-311.pyc
ADDED
|
Binary file (20.5 kB). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/language_model/__pycache__/blip3o_qwen_inference.cpython-311.pyc
ADDED
|
Binary file (20.7 kB). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/language_model/__pycache__/llava_llama.cpython-311.pyc
ADDED
|
Binary file (21.3 kB). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/language_model/__pycache__/llava_qwen.cpython-311.pyc
ADDED
|
Binary file (19.5 kB). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/language_model/blip3o_qwen.py
ADDED
|
@@ -0,0 +1,421 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Optional, Tuple, Union, Dict
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
from PIL import Image
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
import transformers
|
| 9 |
+
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
|
| 10 |
+
|
| 11 |
+
from transformers.modeling_outputs import CausalLMOutputWithPast
|
| 12 |
+
from transformers.generation.utils import GenerateOutput
|
| 13 |
+
|
| 14 |
+
from blip3o.model.blip3o_arch import blip3oMetaModel, blip3oMetaForCausalLM
|
| 15 |
+
|
| 16 |
+
from transformers import Qwen3Config, Qwen3Model, Qwen3ForCausalLM
|
| 17 |
+
|
| 18 |
+
from blip3o.constants import UND_IMAGE_TOKEN_IDX, DEFAULT_IM_START_TOKEN_IDX
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
from diffusers.utils.torch_utils import randn_tensor
|
| 23 |
+
from diffusers.pipelines.pipeline_utils import numpy_to_pil
|
| 24 |
+
import numpy as np
|
| 25 |
+
from diffusers.models import AutoencoderKL
|
| 26 |
+
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class blip3oQwenConfig(Qwen3Config):
    """Qwen3 config subclass tagged with the custom ``blip3o_qwen`` model type."""
    model_type = "blip3o_qwen"
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class blip3oQwenModel(blip3oMetaModel, Qwen3Model):
    """Qwen3 backbone mixed with the blip3o multimodal meta-model."""
    config_class = blip3oQwenConfig

    def __init__(self, config: Qwen3Config):
        # Cooperative init: blip3oMetaModel sets up multimodal modules,
        # Qwen3Model builds the transformer stack.
        super(blip3oQwenModel, self).__init__(config)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class blip3oQwenForCausalLM(Qwen3ForCausalLM, blip3oMetaForCausalLM):
|
| 41 |
+
config_class = blip3oQwenConfig
|
| 42 |
+
|
| 43 |
+
    def __init__(self, config):
        """Build the causal LM: Qwen3 base init, then multimodal model + LM head."""
        Qwen3ForCausalLM.__init__(self, config)
        config.model_type = "blip3o_qwen"

        # Replace the stock Qwen3 model with the multimodal-aware variant.
        self.model = blip3oQwenModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        # Initialize weights and apply final processing
        self.post_init()
|
| 51 |
+
|
| 52 |
+
    def get_model(self):
        """Return the underlying ``blip3oQwenModel`` backbone."""
        return self.model
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        ids: Optional[list] = None,
        i_s_pos: Optional[list] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        gen_image: Optional[torch.FloatTensor] = None,
        und_image: Optional[torch.FloatTensor] = None,
        image_sizes: Optional[List[List[int]]] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Training forward pass combining two objectives.

        * Understanding: next-token cross-entropy when ``und_image`` is given.
        * Generation: flow-matching MSE on DiT noise prediction when
          ``gen_image`` is given; conditions the DiT on the 64 hidden states
          starting at each sample's ``i_s_pos`` position.

        Returns a ``CausalLMOutputWithPast`` whose ``loss`` is the sum of both
        terms (either may be 0 when its input is absent).

        NOTE(review): ``latents`` is only bound inside the
        ``inputs_embeds is None`` branch; calling with precomputed
        ``inputs_embeds`` plus ``gen_image`` would raise NameError — confirm
        callers never do that.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if inputs_embeds is None:
            # Splice image embeddings into the token stream and fetch the VAE
            # latents of the generation target (None when no gen_image).
            (
                input_ids,
                position_ids,
                attention_mask,
                past_key_values,
                inputs_embeds,
                labels,
                latents
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                position_ids,
                attention_mask,
                past_key_values,
                labels,
                gen_image,
                und_image,
                i_s_pos,
                image_sizes
            )

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)
        logits = logits.float()

        ## image understanding loss
        loss = 0
        if und_image is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = torch.nn.CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        # image generation loss
        img_loss = 0
        if gen_image is not None:
            # Collect the 64 query hidden states per sample (skip samples
            # whose i_s_pos sentinel is -1, i.e. no generation target).
            img_hidden_states = []
            for b in range(hidden_states.shape[0]):
                if not i_s_pos[b] == -1:
                    img_hidden_states.append(hidden_states[b,i_s_pos[b]:i_s_pos[b]+64, :])

            img_hidden_states = torch.stack(img_hidden_states, dim=0)
            img_hidden_states = self.get_model().down_projector(img_hidden_states)
            bsz = latents.shape[0]
            dtype = latents.dtype
            # Flow-matching: interpolate clean latents toward noise at a
            # uniformly sampled timestep, predict the (noise - latents) velocity.
            noise = torch.randn_like(latents, device=latents.device)
            u = torch.rand(size=(bsz,), device="cpu")
            indices = (u * self.get_model().noise_scheduler.config.num_train_timesteps).long()
            timesteps = self.get_model().noise_scheduler.timesteps[indices].to(device=latents.device)
            sigmas = self.get_sigmas(timesteps, latents.device, n_dim=latents.ndim, dtype=dtype)
            noisy_latents = (1.0 - sigmas) * latents + sigmas * noise
            # mask_drop randomly drops conditioning (classifier-free guidance
            # training) -- presumably; confirm against mask_drop's definition.
            noise_pred = self.get_model().dit(
                x=noisy_latents,
                timestep=timesteps,
                z_latents=self.mask_drop(img_hidden_states),
            )
            target = noise - latents
            img_loss = F.mse_loss(noise_pred.float(), target.float(), reduction="mean")

        # NOTE(review): unconditional debug print on every training step --
        # consider routing through logging or removing.
        print(f"img loss {img_loss}, text loss {loss}")
        total_loss = img_loss + loss

        return CausalLMOutputWithPast(
            loss=total_loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images: Optional[torch.Tensor] = None,
        image_sizes: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Text generation with optional image understanding inputs.

        When ``images`` is given, token ids are converted to embeddings with
        image features spliced in; otherwise plain token embeddings are used.
        Delegates the actual decoding to ``Qwen3ForCausalLM.generate`` via
        ``inputs_embeds`` (caller-supplied ``inputs_embeds`` is rejected).
        """
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images is not None:
            (
                inputs,
                position_ids,
                attention_mask,
                _,
                inputs_embeds,
                img_indicator,
                _
            ) = self.prepare_inputs_labels_for_understanding(
                inputs,
                position_ids,
                attention_mask,
                None,
                None,
                images,
                image_sizes=image_sizes
            )
        else:
            # Pure-text path: embed the prompt tokens directly.
            inputs_embeds = self.get_model().embed_tokens(inputs)

        return super().generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs
        )
|
| 219 |
+
|
| 220 |
+
    @torch.no_grad()
    def generate_image(
        self,
        text: List[str],
        tokenizer: AutoTokenizer,
        pixel_values: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[torch.Tensor] = None,
        max_var: Optional[float] = None,
        # placeholder: str = DEFAULT_IMG_PLACEHOLDER,
    ):
        """Generate image features from text (and optional reference image).

        Appends the image-start token and the learned latent queries to the
        prompt, runs the LM once, and feeds the final-layer hidden states of
        the query positions to the DiT sampler.

        NOTE(review): the ``torch.tensor([[...]])`` concat and the final
        ``view(1, 1792, -1)`` hard-code batch size 1 (and a 1792 feature dim)
        — confirm multi-prompt calls are never made.
        """
        scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained("Alpha-VLLM/Lumina-Next-SFT-diffusers", subfolder="scheduler")

        N_QUERY = self.get_n_query()
        inputs = tokenizer(text, padding="longest", return_tensors="pt")
        device = self.get_model().device
        attention_mask = inputs.attention_mask.to(device)
        input_ids = inputs.input_ids.to(device) # B x N
        # Mark the start of the image span the model should "fill in".
        input_ids = torch.cat([input_ids, torch.tensor([[DEFAULT_IM_START_TOKEN_IDX]]).to(device)], dim=1)
        # breakpoint()

        text_embeds = self.get_model().embed_tokens(input_ids)
        latent_queries = self.get_model().latent_queries.repeat(text_embeds.shape[0], 1, 1)

        if pixel_values is not None:
            # Replace understanding-image placeholder tokens with vision
            # encoder features.
            und_image_idx = (input_ids == UND_IMAGE_TOKEN_IDX)
            pixel_values = pixel_values.type(self.visual.dtype)
            und_image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
            text_embeds[und_image_idx] = und_image_embeds.to(text_embeds.device)[:und_image_idx.sum(), :]

        text_embeds = torch.cat([text_embeds, latent_queries], dim=1)
        attention_mask = torch.cat([attention_mask, torch.ones_like(latent_queries[:, :, 0])], dim=1)

        outputs = self.model(
            inputs_embeds=text_embeds,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True,
        )
        # Hidden states of the trailing N_QUERY latent-query positions.
        hidden_states = outputs.hidden_states[-1][:,-N_QUERY:,:]
        img_hidden_states = hidden_states
        output_img = self.sample_images(img_hidden_states, scheduler)
        output_img = output_img.view(1, 1792, -1).permute(0,2,1).contiguous()

        return output_img
|
| 269 |
+
|
| 270 |
+
    def sample_images(
        self,
        img_hidden_states,
        scheduler,
        guidance_scale: float = 3.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        num_inference_steps: int = 30,
        num_images_per_prompt: int = 1,
        return_tensor=False,
        **kwargs,
    ):
        """Run the DiT flow-matching sampler with classifier-free guidance.

        Conditions on ``img_hidden_states``; the unconditional branch uses a
        zero tensor of the same shape. Returns the final latents (decoding is
        commented out below; ``return_tensor`` is currently unused).
        """
        device = img_hidden_states.device
        dtype = img_hidden_states.dtype

        # Unconditional (zeros) + conditional batch for CFG in one pass.
        img_hidden_states_null = torch.zeros_like(img_hidden_states, device=device, dtype=dtype)
        img_hidden_states_input = torch.cat([img_hidden_states_null, img_hidden_states], 0)

        batch_size = img_hidden_states.shape[0]
        latent_size = self.get_model().dit.config.input_size
        latent_channels = self.get_model().dit.config.in_channels

        latents = randn_tensor(
            shape=(batch_size * num_images_per_prompt, latent_channels, latent_size, latent_size),
            generator=generator,
            device=device,
            dtype=dtype,
        )

        # set step values
        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
        scheduler.set_timesteps(num_inference_steps, sigmas=sigmas)

        # Repeat z_latents and conditions for each image per prompt
        img_hidden_states_input = img_hidden_states_input.repeat_interleave(num_images_per_prompt, dim=0)

        for t in scheduler.timesteps:
            # Duplicate latents so the uncond/cond halves share one forward.
            latent_model_input = latents.repeat(2, 1, 1, 1)
            if hasattr(scheduler, "scale_model_input"):
                latent_model_input = scheduler.scale_model_input(latent_model_input, t)

            # predict noise model_output
            noise_pred = self.get_model().dit(
                x=latent_model_input,
                timestep=t.unsqueeze(0).expand(latent_model_input.shape[0]).to(latent_model_input.device, torch.long),
                z_latents=img_hidden_states_input,
            )

            # perform guidance
            noise_pred_uncond, noise_pred = noise_pred.chunk(2)
            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred - noise_pred_uncond)

            # compute previous image: x_t -> x_t-1
            latents = scheduler.step(noise_pred, t, latents).prev_sample

        # samples = self.decode_latents(latents, return_tensor=return_tensor)
        return latents
|
| 328 |
+
|
| 329 |
+
def decode_latents(self, latents, normalize=True, return_tensor=False):
|
| 330 |
+
if isinstance(self.get_model().vae, AutoencoderKL):
|
| 331 |
+
latents = latents / self.get_model().vae.config.scaling_factor
|
| 332 |
+
if self.get_model().vae.config.shift_factor is not None:
|
| 333 |
+
latents = latents + self.get_model().vae.config.shift_factor
|
| 334 |
+
latents = latents.to(dtype=torch.float32)
|
| 335 |
+
samples = self.get_model().vae.decode(latents).sample
|
| 336 |
+
else:
|
| 337 |
+
samples = self.get_model().vae.decode(latents)
|
| 338 |
+
if normalize:
|
| 339 |
+
samples = (samples / 2 + 0.5).clamp(0, 1)
|
| 340 |
+
else:
|
| 341 |
+
samples = samples.clamp(-1, 1)
|
| 342 |
+
if return_tensor:
|
| 343 |
+
return samples
|
| 344 |
+
samples = samples.cpu().permute(0, 2, 3, 1).float().numpy()
|
| 345 |
+
samples = numpy_to_pil(samples)
|
| 346 |
+
return samples
|
| 347 |
+
|
| 348 |
+
    def prepare_and_encode_inputs(
        self,
        inputs: List[str | Image.Image],
        tokenizer: AutoTokenizer,
        do_classifier_free_guidance: bool = False,
    ):
        """Encode a mixed list of strings and PIL images into prompt features.

        Image-only input goes through the vision encoder; anything involving
        text goes through ``generate_image``. With CFG enabled, the negative
        (null) prompt features are concatenated onto the batch.

        NOTE(review): ``DEFAULT_IMAGE_TOKEN`` is not imported in this module
        — the image branch would raise NameError as-is.
        NOTE(review): ``generate_image`` has no ``image=`` parameter; the
        call below passing ``image=image_prompt`` would raise TypeError.
        NOTE(review): ``negative_prompt`` is a fresh local dict, so the
        "if key not in" caching never persists across calls.
        """
        # pdb.set_trace()
        device = self.get_model().device
        dtype = self.get_model().dtype

        has_image, has_text = False, False
        text_prompt, image_prompt = "", []
        img_processor = self.get_vision_tower().image_processor
        negative_prompt = {}

        for x in inputs:
            if isinstance(x, str):
                has_text = True
                text_prompt += x
            else:
                has_image = True
                text_prompt += DEFAULT_IMAGE_TOKEN
                image_prompt.append(img_processor.preprocess(x, return_tensors='pt')['pixel_values'])
        # pdb.set_trace()
        if len(image_prompt) == 0:
            image_prompt = None
        else:
            image_prompt = torch.cat(image_prompt)
            image_prompt = image_prompt.type(dtype).to(device)

        if has_image and not has_text:
            prompt = self.encode_images(image_prompt)
            # pdb.set_trace()
            if do_classifier_free_guidance:
                key = "[NULL_IMAGE]"
                if key not in negative_prompt:
                    negative_image = torch.zeros_like(image_prompt)
                    negative_prompt[key] = self.encode_images(negative_image)
                prompt = torch.cat([prompt, negative_prompt[key]], dim=0)
        else:
            prompt = self.generate_image(text=[text_prompt], image=image_prompt, tokenizer=tokenizer)
            if do_classifier_free_guidance:
                key = ""
                if key not in negative_prompt:
                    negative_prompt[key] = self.generate_image(text=[""], tokenizer=tokenizer)
                prompt = torch.cat([prompt, negative_prompt[key]], dim=0)

        gen_pooling = self.get_gen_pooling()
        n_query = self.get_n_query()
        num_img, _, c = prompt.shape
        # Optional spatial average-pooling of the query grid (sqrt(n) x sqrt(n))
        # when the configured pooling mode requests it.
        if 'pool2d' in gen_pooling and has_text and not 'early' in gen_pooling:
            stride = int(gen_pooling.split('_')[1])
            sqrt_n = int(n_query**0.5)
            prompt = prompt.permute(0, 2, 1).reshape(num_img, -1, sqrt_n, sqrt_n)
            prompt = F.avg_pool2d(prompt, kernel_size=(stride, stride), stride=stride)
            prompt = prompt.reshape(num_img, c, -1).permute(0,2,1)
        return prompt
|
| 405 |
+
|
| 406 |
+
|
| 407 |
+
def prepare_inputs_for_generation(self, input_ids, past_key_values=None,
|
| 408 |
+
inputs_embeds=None, **kwargs):
|
| 409 |
+
images = kwargs.pop("images", None)
|
| 410 |
+
image_sizes = kwargs.pop("image_sizes", None)
|
| 411 |
+
inputs = super().prepare_inputs_for_generation(
|
| 412 |
+
input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
|
| 413 |
+
)
|
| 414 |
+
if images is not None:
|
| 415 |
+
inputs['images'] = images
|
| 416 |
+
if image_sizes is not None:
|
| 417 |
+
inputs['image_sizes'] = image_sizes
|
| 418 |
+
return inputs
|
| 419 |
+
|
| 420 |
+
# Register the custom config/model pair so AutoConfig / AutoModelForCausalLM
# can resolve checkpoints whose config declares model_type == "blip3o_qwen".
AutoConfig.register("blip3o_qwen", blip3oQwenConfig)
AutoModelForCausalLM.register(blip3oQwenConfig, blip3oQwenForCausalLM)
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/language_model/blip3o_qwen_inference.py
ADDED
|
@@ -0,0 +1,418 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List, Optional, Tuple, Union, Dict
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
from PIL import Image
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
import transformers
|
| 9 |
+
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
|
| 10 |
+
|
| 11 |
+
from transformers.modeling_outputs import CausalLMOutputWithPast
|
| 12 |
+
from transformers.generation.utils import GenerateOutput
|
| 13 |
+
|
| 14 |
+
from blip3o.model.blip3o_arch import blip3oMetaModel, blip3oMetaForCausalLM
|
| 15 |
+
|
| 16 |
+
from transformers import Qwen3Config, Qwen3Model, Qwen3ForCausalLM
|
| 17 |
+
|
| 18 |
+
from blip3o.constants import UND_IMAGE_TOKEN_IDX, DEFAULT_IM_START_TOKEN_IDX
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
from diffusers.utils.torch_utils import randn_tensor
|
| 23 |
+
from diffusers.pipelines.pipeline_utils import numpy_to_pil
|
| 24 |
+
import numpy as np
|
| 25 |
+
from diffusers.models import AutoencoderKL
|
| 26 |
+
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class blip3oQwenConfig(Qwen3Config):
    """Qwen3 config tagged as ``blip3o_qwen``.

    NOTE(review): duplicates the identically-named class in
    ``blip3o_qwen.py`` — consider importing it instead of redefining.
    """
    model_type = "blip3o_qwen"
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class blip3oQwenModel(blip3oMetaModel, Qwen3Model):
    """Qwen3 backbone mixed with the blip3o multimodal meta-model.

    NOTE(review): duplicates the identically-named class in
    ``blip3o_qwen.py``.
    """
    config_class = blip3oQwenConfig

    def __init__(self, config: Qwen3Config):
        # Cooperative init across both base classes.
        super(blip3oQwenModel, self).__init__(config)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class blip3oQwenForInferenceLM(Qwen3ForCausalLM, blip3oMetaForCausalLM):
|
| 41 |
+
config_class = blip3oQwenConfig
|
| 42 |
+
|
| 43 |
+
    def __init__(self, config):
        """Build the inference-oriented causal LM: Qwen3 base init, then
        multimodal model + LM head."""
        Qwen3ForCausalLM.__init__(self, config)
        config.model_type = "blip3o_qwen"

        # Replace the stock Qwen3 model with the multimodal-aware variant.
        self.model = blip3oQwenModel(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        # Initialize weights and apply final processing
        self.post_init()
|
| 51 |
+
|
| 52 |
+
    def get_model(self):
        """Return the underlying ``blip3oQwenModel`` backbone."""
        return self.model
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        ids: Optional[list] = None,
        i_s_pos: Optional[list] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        gen_image: Optional[torch.FloatTensor] = None,
        und_image: Optional[torch.FloatTensor] = None,
        image_sizes: Optional[List[List[int]]] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Forward pass of the inference-oriented variant.

        With ``labels`` given it computes a text CE loss and a flow-matching
        image loss; without labels it behaves as a plain LM forward and
        ``loss`` in the output is None.

        NOTE(review): ``total_loss = img_loss`` at the end discards the text
        CE loss computed above — looks deliberate for this variant but verify.
        NOTE(review): unlike the training class, the ``i_s_pos`` loop has no
        ``-1`` sentinel guard; samples without a generation span would slice
        garbage — confirm callers guarantee valid positions.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if inputs_embeds is None:
            # Splice image embeddings into the token stream and fetch the VAE
            # latents of the generation target (None when absent).
            (
                input_ids,
                position_ids,
                attention_mask,
                past_key_values,
                inputs_embeds,
                labels,
                latents
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                position_ids,
                attention_mask,
                past_key_values,
                labels,
                gen_image,
                und_image,
                i_s_pos,
                image_sizes
            )

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)
        logits = logits.float()

        total_loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = torch.nn.CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

            # compute image loss
            # target_img_embeds = torch.clone(inputs_embeds.detach())[:,1:,:] # get target image emb
            img_loss_funct = torch.nn.MSELoss()
            # img_hidden_states = self.get_model().down_projector(hidden_states[:,-self.get_n_query():,:])
            img_hidden_states = []

            # Collect the 64 query hidden states per sample starting at i_s_pos.
            for b in range(hidden_states.shape[0]):
                img_hidden_states.append(hidden_states[b,i_s_pos[b]:i_s_pos[b]+64,:])
            img_hidden_states = torch.stack(img_hidden_states,dim=0)
            img_hidden_states = self.get_model().down_projector(img_hidden_states)
            # img_loss = 0.0
            if latents is None:
                # Degenerate zero-gradient MSE keeps the graph/loss shape
                # consistent when there is no generation target.
                img_loss = img_loss_funct(img_hidden_states, torch.clone(img_hidden_states.detach()))
            else:
                bsz = latents.shape[0]
                # device = latents.device
                dtype = latents.dtype
                # Flow-matching: noise the latents at a sampled timestep and
                # regress the (noise - latents) velocity.
                noise = torch.randn_like(latents, device=latents.device)
                u = torch.rand(size=(bsz,), device="cpu")
                indices = (u * self.get_model().noise_scheduler.config.num_train_timesteps).long()
                timesteps = self.get_model().noise_scheduler.timesteps[indices].to(device=latents.device)
                sigmas = self.get_sigmas(timesteps, latents.device, n_dim=latents.ndim, dtype=dtype)
                noisy_latents = (1.0 - sigmas) * latents + sigmas * noise
                noise_pred = self.get_model().dit(
                    x=noisy_latents,
                    timestep=timesteps,
                    z_latents=self.mask_drop(img_hidden_states),
                )
                target = noise - latents
                img_loss = F.mse_loss(noise_pred.float(), target.float(), reduction="mean")
            # NOTE(review): debug print on every labeled forward.
            print(f"img loss {img_loss}")
            total_loss = img_loss

        return CausalLMOutputWithPast(
            loss=total_loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
@torch.no_grad()
def generate(
    self,
    inputs: Optional[torch.Tensor] = None,
    images: Optional[torch.Tensor] = None,
    image_sizes: Optional[torch.Tensor] = None,
    **kwargs,
) -> Union[GenerateOutput, torch.LongTensor]:
    """Text-generation entry point that optionally splices image features in.

    When `images` is given, the multimodal preprocessing path builds
    `inputs_embeds` (with visual features merged into the token stream);
    otherwise the token ids are embedded directly. Generation is then
    delegated to the parent class via `inputs_embeds`.
    """
    position_ids = kwargs.pop("position_ids", None)
    attention_mask = kwargs.pop("attention_mask", None)
    if "inputs_embeds" in kwargs:
        raise NotImplementedError("`inputs_embeds` is not supported")

    if images is None:
        # Text-only: embed the provided token ids directly.
        inputs_embeds = self.get_model().embed_tokens(inputs)
    else:
        # Multimodal: the understanding-path helper returns the full
        # (inputs, position_ids, attention_mask, past, embeds, indicator, labels)
        # tuple; only the pieces needed for generation are kept.
        prepared = self.prepare_inputs_labels_for_understanding(
            inputs,
            position_ids,
            attention_mask,
            None,
            None,
            images,
            image_sizes=image_sizes,
        )
        inputs, position_ids, attention_mask, _, inputs_embeds, img_indicator, _ = prepared

    return super().generate(
        position_ids=position_ids,
        attention_mask=attention_mask,
        inputs_embeds=inputs_embeds,
        **kwargs,
    )
|
| 215 |
+
|
| 216 |
+
@torch.no_grad()
def generate_image(
    self,
    text: List[str],
    tokenizer: AutoTokenizer,
    pixel_values: Optional[torch.Tensor] = None,
    image_grid_thw: Optional[torch.Tensor] = None,
    max_var: Optional[float] = None,
    # placeholder: str = DEFAULT_IMG_PLACEHOLDER,
):
    """Generate image latents conditioned on `text` (and optionally input images).

    Tokenizes the prompt, appends learned latent queries to the embedding
    sequence, runs the LM once, and feeds the hidden states at the query
    positions into the diffusion sampler.

    NOTE(review): the scheduler is re-downloaded/instantiated from the Hub on
    every call — presumably fine for occasional inference; confirm before any
    batched/serving use.
    NOTE(review): `max_var` is accepted but never used in this body.
    """
    scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained("Alpha-VLLM/Lumina-Next-SFT-diffusers", subfolder="scheduler")


    N_QUERY = self.get_n_query()
    inputs = tokenizer(text, padding="longest", return_tensors="pt")
    device = self.get_model().device
    attention_mask = inputs.attention_mask.to(device)
    input_ids = inputs.input_ids.to(device) # B x N
    # Append the image-start token so the LM knows image content follows.
    # NOTE(review): the appended tensor has batch size 1 — assumes a single
    # prompt per call; a multi-prompt batch would broadcast-fail here. Confirm.
    input_ids = torch.cat([input_ids, torch.tensor([[DEFAULT_IM_START_TOKEN_IDX]]).to(device)], dim=1)
    # breakpoint()


    text_embeds = self.get_model().embed_tokens(input_ids)
    # Learned latent queries, one copy per batch element.
    latent_queries = self.get_model().latent_queries.repeat(text_embeds.shape[0], 1, 1)


    if pixel_values is not None:
        # Replace the understanding-image placeholder tokens with visual
        # features from the vision encoder, truncated to the number of
        # placeholder positions actually present.
        und_image_idx = (input_ids == UND_IMAGE_TOKEN_IDX)
        pixel_values = pixel_values.type(self.visual.dtype)
        und_image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
        text_embeds[und_image_idx] = und_image_embeds.to(text_embeds.device)[:und_image_idx.sum(), :]


    # Latent queries go at the end of the sequence and are always attended to.
    text_embeds = torch.cat([text_embeds, latent_queries], dim=1)
    attention_mask = torch.cat([attention_mask, torch.ones_like(latent_queries[:, :, 0])], dim=1)


    outputs = self.model(
        inputs_embeds=text_embeds,
        attention_mask=attention_mask,
        output_hidden_states=True,
        return_dict=True,
    )
    # Last-layer hidden states at the latent-query positions condition the DiT.
    hidden_states = outputs.hidden_states[-1][:,-N_QUERY:,:]
    img_hidden_states = hidden_states
    output_img = self.sample_images(img_hidden_states, scheduler)
    # Flatten the sampled latent grid to (1, tokens, 1792) feature layout.
    # NOTE(review): hard-codes batch size 1 and 1792 channels — confirm these
    # match the generation vision tower's output shape.
    output_img = output_img.view(1, 1792, -1).permute(0,2,1).contiguous()

    return output_img
|
| 265 |
+
|
| 266 |
+
def sample_images(
    self,
    img_hidden_states,
    scheduler,
    guidance_scale: float = 3.0,
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    num_inference_steps: int = 30,
    num_images_per_prompt: int = 1,
    return_tensor=False,
    **kwargs,
):
    """Run the flow-matching sampling loop with classifier-free guidance.

    `img_hidden_states` are the LM hidden states that condition the DiT.
    Returns the final latents (decoding is left to the caller; the
    `decode_latents` call below is intentionally commented out, so
    `return_tensor` is currently unused).
    """

    device = img_hidden_states.device
    dtype = img_hidden_states.dtype


    # Unconditional branch for CFG is an all-zeros conditioning tensor;
    # batch layout is [uncond..., cond...] to match the chunk(2) split below.
    img_hidden_states_null = torch.zeros_like(img_hidden_states, device=device, dtype=dtype)
    img_hidden_states_input = torch.cat([img_hidden_states_null, img_hidden_states], 0)

    batch_size = img_hidden_states.shape[0]
    latent_size = self.get_model().dit.config.input_size
    latent_channels = self.get_model().dit.config.in_channels

    # Initial noise latents, one per requested image.
    latents = randn_tensor(
        shape=(batch_size * num_images_per_prompt, latent_channels, latent_size, latent_size),
        generator=generator,
        device=device,
        dtype=dtype,
    )

    # set step values: sigmas descend linearly from 1.0 toward 0.
    sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
    scheduler.set_timesteps(num_inference_steps, sigmas=sigmas)

    # Repeat z_latents and conditions for each image per prompt
    img_hidden_states_input = img_hidden_states_input.repeat_interleave(num_images_per_prompt, dim=0)

    for t in scheduler.timesteps:
        # Duplicate latents so uncond/cond branches run in one forward pass.
        latent_model_input = latents.repeat(2, 1, 1, 1)
        if hasattr(scheduler, "scale_model_input"):
            latent_model_input = scheduler.scale_model_input(latent_model_input, t)

        # predict noise model_output
        noise_pred = self.get_model().dit(
            x=latent_model_input,
            timestep=t.unsqueeze(0).expand(latent_model_input.shape[0]).to(latent_model_input.device, torch.long),
            z_latents=img_hidden_states_input,
        )

        # perform guidance: extrapolate from the unconditional prediction
        # toward the conditional one by `guidance_scale`.
        noise_pred_uncond, noise_pred = noise_pred.chunk(2)
        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred - noise_pred_uncond)

        # compute previous image: x_t -> x_t-1
        latents = scheduler.step(noise_pred, t, latents).prev_sample

    # samples = self.decode_latents(latents, return_tensor=return_tensor)
    # breakpoint()
    return latents
|
| 325 |
+
|
| 326 |
+
def decode_latents(self, latents, normalize=True, return_tensor=False):
    """Decode diffusion latents with the model's VAE.

    For an `AutoencoderKL`, the encode-time scaling (and optional shift) is
    undone and decoding runs in float32. `normalize=True` maps the output
    from [-1, 1] to [0, 1]; otherwise values are clamped to [-1, 1].
    Returns a tensor when `return_tensor` is set, else a list of PIL images.
    """
    vae = self.get_model().vae
    if isinstance(vae, AutoencoderKL):
        # Undo the scaling/shift applied when the latents were produced.
        latents = latents / vae.config.scaling_factor
        if vae.config.shift_factor is not None:
            latents = latents + vae.config.shift_factor
        samples = vae.decode(latents.to(dtype=torch.float32)).sample
    else:
        samples = vae.decode(latents)

    if normalize:
        samples = (samples / 2 + 0.5).clamp(0, 1)
    else:
        samples = samples.clamp(-1, 1)

    if return_tensor:
        return samples
    # NCHW tensor -> NHWC numpy -> PIL images.
    return numpy_to_pil(samples.cpu().permute(0, 2, 3, 1).float().numpy())
|
| 344 |
+
|
| 345 |
+
def prepare_and_encode_inputs(
    self,
    inputs: List[str | Image.Image],
    tokenizer: AutoTokenizer,
    do_classifier_free_guidance: bool = False,
):
    """Encode a mixed list of strings and PIL images into a prompt tensor.

    Strings are concatenated into one text prompt; each image contributes a
    `DEFAULT_IMAGE_TOKEN` placeholder plus preprocessed pixels. Image-only
    inputs go through `encode_images`; anything containing text goes through
    `generate_image`. With `do_classifier_free_guidance`, the corresponding
    negative (zero-image or empty-text) encoding is concatenated on dim 0.
    Optionally average-pools the result when the gen-pooling config asks for it.
    """
    device = self.get_model().device
    dtype = self.get_model().dtype

    has_image, has_text = False, False
    text_prompt, image_prompt = "", []
    img_processor = self.get_vision_tower().image_processor
    negative_prompt = {}

    for x in inputs:
        if isinstance(x, str):
            has_text = True
            text_prompt += x
        else:
            has_image = True
            text_prompt += DEFAULT_IMAGE_TOKEN
            image_prompt.append(img_processor.preprocess(x, return_tensors='pt')['pixel_values'])

    if len(image_prompt) == 0:
        image_prompt = None
    else:
        image_prompt = torch.cat(image_prompt)
        image_prompt = image_prompt.type(dtype).to(device)

    if has_image and not has_text:
        prompt = self.encode_images(image_prompt)
        if do_classifier_free_guidance:
            key = "[NULL_IMAGE]"
            if key not in negative_prompt:
                # All-zero pixels serve as the unconditional image.
                negative_image = torch.zeros_like(image_prompt)
                negative_prompt[key] = self.encode_images(negative_image)
            prompt = torch.cat([prompt, negative_prompt[key]], dim=0)
    else:
        # FIX: generate_image() has no `image` parameter — passing
        # `image=image_prompt` raised TypeError. The preprocessed pixels
        # belong in `pixel_values`.
        prompt = self.generate_image(text=[text_prompt], pixel_values=image_prompt, tokenizer=tokenizer)
        if do_classifier_free_guidance:
            key = ""
            if key not in negative_prompt:
                # Empty-text prompt serves as the unconditional branch.
                negative_prompt[key] = self.generate_image(text=[""], tokenizer=tokenizer)
            prompt = torch.cat([prompt, negative_prompt[key]], dim=0)

    gen_pooling = self.get_gen_pooling()
    n_query = self.get_n_query()
    num_img, _, c = prompt.shape
    if 'pool2d' in gen_pooling and has_text and not 'early' in gen_pooling:
        # Reshape token sequence into a sqrt(n) x sqrt(n) grid and pool it;
        # stride is encoded in the config string, e.g. "pool2d_2".
        stride = int(gen_pooling.split('_')[1])
        sqrt_n = int(n_query**0.5)
        prompt = prompt.permute(0, 2, 1).reshape(num_img, -1, sqrt_n, sqrt_n)
        prompt = F.avg_pool2d(prompt, kernel_size=(stride, stride), stride=stride)
        prompt = prompt.reshape(num_img, c, -1).permute(0,2,1)
    return prompt
|
| 402 |
+
|
| 403 |
+
|
| 404 |
+
def prepare_inputs_for_generation(self, input_ids, past_key_values=None,
                                  inputs_embeds=None, **kwargs):
    """Extend the parent hook so image inputs survive the generation loop.

    `images` / `image_sizes` are stripped from kwargs before calling the
    parent (which would not understand them) and re-attached to the
    returned model-input dict when present.
    """
    images = kwargs.pop("images", None)
    image_sizes = kwargs.pop("image_sizes", None)

    model_inputs = super().prepare_inputs_for_generation(
        input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
    )

    if images is not None:
        model_inputs["images"] = images
    if image_sizes is not None:
        model_inputs["image_sizes"] = image_sizes

    return model_inputs
|
| 416 |
+
|
| 417 |
+
# Register the custom config/model pair with the HF Auto* factories so that
# `AutoModelForCausalLM.from_pretrained` resolves model_type "blip3o_qwen".
AutoConfig.register("blip3o_qwen", blip3oQwenConfig)
AutoModelForCausalLM.register(blip3oQwenConfig, blip3oQwenForInferenceLM)
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/lumina_nextdit2d.py
ADDED
|
@@ -0,0 +1,365 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Alpha-VLLM Authors and The HuggingFace Team. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
from typing import Any, Dict, Optional
|
| 16 |
+
|
| 17 |
+
import torch
|
| 18 |
+
import torch.nn as nn
|
| 19 |
+
from diffusers.configuration_utils import ConfigMixin, register_to_config
|
| 20 |
+
from diffusers.models.attention import LuminaFeedForward
|
| 21 |
+
from diffusers.models.attention_processor import Attention, LuminaAttnProcessor2_0
|
| 22 |
+
from diffusers.models.embeddings import LuminaCombinedTimestepCaptionEmbedding, LuminaPatchEmbed, PixArtAlphaTextProjection
|
| 23 |
+
|
| 24 |
+
from diffusers.models.modeling_outputs import Transformer2DModelOutput
|
| 25 |
+
from diffusers.models.modeling_utils import ModelMixin
|
| 26 |
+
from diffusers.models.normalization import LuminaLayerNormContinuous, LuminaRMSNormZero, RMSNorm
|
| 27 |
+
from diffusers.utils import is_torch_version, logging
|
| 28 |
+
|
| 29 |
+
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class LuminaNextDiTBlock(nn.Module):
    """
    A LuminaNextDiTBlock for LuminaNextDiT2DModel.

    Parameters:
        dim (`int`): Embedding dimension of the input features.
        num_attention_heads (`int`): Number of attention heads.
        num_kv_heads (`int`):
            Number of attention heads in key and value features (if using GQA), or set to None for the same as query.
        multiple_of (`int`): The number of multiple of ffn layer.
        ffn_dim_multiplier (`float`): The multipier factor of ffn layer dimension.
        norm_eps (`float`): The eps for norm layer.
        qk_norm (`bool`): normalization for query and key.
        cross_attention_dim (`int`): Cross attention embedding dimension of the input text prompt hidden_states.
        norm_elementwise_affine (`bool`, *optional*, defaults to True),
    """

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        num_kv_heads: int,
        multiple_of: int,
        ffn_dim_multiplier: float,
        norm_eps: float,
        qk_norm: bool,
        cross_attention_dim: int,
        norm_elementwise_affine: bool = True,
    ) -> None:
        super().__init__()
        self.head_dim = dim // num_attention_heads

        # Per-head learnable gate (zero-initialized) applied to the
        # cross-attention branch — starts the block as self-attention only.
        self.gate = nn.Parameter(torch.zeros([num_attention_heads]))

        # Self-attention
        self.attn1 = Attention(
            query_dim=dim,
            cross_attention_dim=None,
            dim_head=dim // num_attention_heads,
            qk_norm="layer_norm_across_heads" if qk_norm else None,
            heads=num_attention_heads,
            kv_heads=num_kv_heads,
            eps=1e-5,
            bias=False,
            out_bias=False,
            processor=LuminaAttnProcessor2_0(),
        )
        # Drop attn1's output projection: self- and cross-attention outputs are
        # summed per-head below and share attn2's projection instead.
        self.attn1.to_out = nn.Identity()

        # Cross-attention
        self.attn2 = Attention(
            query_dim=dim,
            cross_attention_dim=cross_attention_dim,
            dim_head=dim // num_attention_heads,
            qk_norm="layer_norm_across_heads" if qk_norm else None,
            heads=num_attention_heads,
            kv_heads=num_kv_heads,
            eps=1e-5,
            bias=False,
            out_bias=False,
            processor=LuminaAttnProcessor2_0(),
        )

        self.feed_forward = LuminaFeedForward(
            dim=dim,
            inner_dim=4 * dim,
            multiple_of=multiple_of,
            ffn_dim_multiplier=ffn_dim_multiplier,
        )

        # AdaLN-style modulated norm producing gates/scales from `temb`.
        self.norm1 = LuminaRMSNormZero(
            embedding_dim=dim,
            norm_eps=norm_eps,
            norm_elementwise_affine=norm_elementwise_affine,
        )
        self.ffn_norm1 = RMSNorm(dim, eps=norm_eps, elementwise_affine=norm_elementwise_affine)

        self.norm2 = RMSNorm(dim, eps=norm_eps, elementwise_affine=norm_elementwise_affine)
        self.ffn_norm2 = RMSNorm(dim, eps=norm_eps, elementwise_affine=norm_elementwise_affine)

        self.norm1_context = RMSNorm(cross_attention_dim, eps=norm_eps, elementwise_affine=norm_elementwise_affine)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        image_rotary_emb: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        encoder_mask: torch.Tensor,
        temb: torch.Tensor,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """
        Perform a forward pass through the LuminaNextDiTBlock.

        Parameters:
            hidden_states (`torch.Tensor`): The input of hidden_states for LuminaNextDiTBlock.
            attention_mask (`torch.Tensor): The input of hidden_states corresponse attention mask.
            image_rotary_emb (`torch.Tensor`): Precomputed cosine and sine frequencies.
            encoder_hidden_states: (`torch.Tensor`): The hidden_states of text prompt are processed by Gemma encoder.
            encoder_mask (`torch.Tensor`): The hidden_states of text prompt attention mask.
            temb (`torch.Tensor`): Timestep embedding with text prompt embedding.
            cross_attention_kwargs (`Dict[str, Any]`): kwargs for cross attention.
        """
        # NOTE(review): `**cross_attention_kwargs` below assumes the caller
        # always supplies a dict; a literal None would raise here — confirm the
        # model-level forward never passes None through.
        residual = hidden_states

        # Self-attention
        norm_hidden_states, gate_msa, scale_mlp, gate_mlp = self.norm1(hidden_states, temb)
        self_attn_output = self.attn1(
            hidden_states=norm_hidden_states,
            encoder_hidden_states=norm_hidden_states,
            attention_mask=attention_mask,
            query_rotary_emb=image_rotary_emb,
            key_rotary_emb=image_rotary_emb,
            **cross_attention_kwargs,
        )

        # Cross-attention (rotary embedding applied to queries only).
        norm_encoder_hidden_states = self.norm1_context(encoder_hidden_states)
        cross_attn_output = self.attn2(
            hidden_states=norm_hidden_states,
            encoder_hidden_states=norm_encoder_hidden_states,
            attention_mask=encoder_mask,
            query_rotary_emb=image_rotary_emb,
            key_rotary_emb=None,
            **cross_attention_kwargs,
        )
        # Gate the cross-attention contribution per head, then merge branches
        # before the shared output projection.
        cross_attn_output = cross_attn_output * self.gate.tanh().view(1, 1, -1, 1)
        mixed_attn_output = self_attn_output + cross_attn_output
        mixed_attn_output = mixed_attn_output.flatten(-2)
        # linear proj
        hidden_states = self.attn2.to_out[0](mixed_attn_output)

        # Gated residual connections around attention and feed-forward.
        hidden_states = residual + gate_msa.unsqueeze(1).tanh() * self.norm2(hidden_states)

        mlp_output = self.feed_forward(self.ffn_norm1(hidden_states) * (1 + scale_mlp.unsqueeze(1)))

        hidden_states = hidden_states + gate_mlp.unsqueeze(1).tanh() * self.ffn_norm2(mlp_output)

        return hidden_states
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
class LuminaNextDiT2DModel(ModelMixin, ConfigMixin):
    """
    LuminaNextDiT: Diffusion model with a Transformer backbone.

    Inherit ModelMixin and ConfigMixin to be compatible with the sampler StableDiffusionPipeline of diffusers.

    Parameters:
        sample_size (`int`): The width of the latent images. This is fixed during training since
            it is used to learn a number of position embeddings.
        patch_size (`int`, *optional*, (`int`, *optional*, defaults to 2):
            The size of each patch in the image. This parameter defines the resolution of patches fed into the model.
        in_channels (`int`, *optional*, defaults to 4):
            The number of input channels for the model. Typically, this matches the number of channels in the input
            images.
        hidden_size (`int`, *optional*, defaults to 4096):
            The dimensionality of the hidden layers in the model. This parameter determines the width of the model's
            hidden representations.
        num_layers (`int`, *optional*, default to 32):
            The number of layers in the model. This defines the depth of the neural network.
        num_attention_heads (`int`, *optional*, defaults to 32):
            The number of attention heads in each attention layer. This parameter specifies how many separate attention
            mechanisms are used.
        num_kv_heads (`int`, *optional*, defaults to 8):
            The number of key-value heads in the attention mechanism, if different from the number of attention heads.
            If None, it defaults to num_attention_heads.
        multiple_of (`int`, *optional*, defaults to 256):
            A factor that the hidden size should be a multiple of. This can help optimize certain hardware
            configurations.
        ffn_dim_multiplier (`float`, *optional*):
            A multiplier for the dimensionality of the feed-forward network. If None, it uses a default value based on
            the model configuration.
        norm_eps (`float`, *optional*, defaults to 1e-5):
            A small value added to the denominator for numerical stability in normalization layers.
        learn_sigma (`bool`, *optional*, defaults to True):
            Whether the model should learn the sigma parameter, which might be related to uncertainty or variance in
            predictions.
        qk_norm (`bool`, *optional*, defaults to True):
            Indicates if the queries and keys in the attention mechanism should be normalized.
        cross_attention_dim (`int`, *optional*, defaults to 2048):
            The dimensionality of the text embeddings. This parameter defines the size of the text representations used
            in the model.
        scaling_factor (`float`, *optional*, defaults to 1.0):
            A scaling factor applied to certain parameters or layers in the model. This can be used for adjusting the
            overall scale of the model's operations.
    """

    _supports_gradient_checkpointing = True
    _no_split_modules = ["LuminaNextDiTBlock"]

    @register_to_config
    def __init__(
        self,
        sample_size: int = 128,
        patch_size: Optional[int] = 2,
        in_channels: Optional[int] = 4,
        hidden_size: Optional[int] = 2304,
        num_layers: Optional[int] = 32, # 32
        num_attention_heads: Optional[int] = 32, # 32
        num_kv_heads: Optional[int] = None,
        multiple_of: Optional[int] = 256,
        ffn_dim_multiplier: Optional[float] = None,
        norm_eps: Optional[float] = 1e-5,
        learn_sigma: Optional[bool] = True,
        qk_norm: Optional[bool] = True,
        cross_attention_dim: Optional[int] = 2048,
        scaling_factor: Optional[float] = 1.0,
    ) -> None:
        super().__init__()
        self.sample_size = sample_size
        self.patch_size = patch_size
        self.in_channels = in_channels
        # When learn_sigma is set, the model predicts mean and variance jointly,
        # doubling the output channel count.
        self.out_channels = in_channels * 2 if learn_sigma else in_channels
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.head_dim = hidden_size // num_attention_heads
        self.scaling_factor = scaling_factor
        self.gradient_checkpointing = False

        # Projects caption features (cross_attention_dim) into the model width.
        self.caption_projection = PixArtAlphaTextProjection(in_features=cross_attention_dim, hidden_size=hidden_size)
        self.patch_embedder = LuminaPatchEmbed(patch_size=patch_size, in_channels=in_channels, embed_dim=hidden_size, bias=True)

        self.time_caption_embed = LuminaCombinedTimestepCaptionEmbedding(hidden_size=min(hidden_size, 1024), cross_attention_dim=hidden_size)

        # NOTE: cross_attention_dim passed to each block is `hidden_size`, since
        # captions are already projected to hidden_size above.
        self.layers = nn.ModuleList(
            [
                LuminaNextDiTBlock(
                    hidden_size,
                    num_attention_heads,
                    num_kv_heads,
                    multiple_of,
                    ffn_dim_multiplier,
                    norm_eps,
                    qk_norm,
                    hidden_size,
                )
                for _ in range(num_layers)
            ]
        )
        self.norm_out = LuminaLayerNormContinuous(
            embedding_dim=hidden_size,
            conditioning_embedding_dim=min(hidden_size, 1024),
            elementwise_affine=False,
            eps=1e-6,
            bias=True,
            out_dim=patch_size * patch_size * self.out_channels,
        )
        # self.final_layer = LuminaFinalLayer(hidden_size, patch_size, self.out_channels)

        assert (hidden_size // num_attention_heads) % 4 == 0, "2d rope needs head dim to be divisible by 4"

    def _set_gradient_checkpointing(self, module, value=False):
        # Standard diffusers hook: toggles checkpointing on any submodule that
        # exposes a `gradient_checkpointing` flag.
        if hasattr(module, "gradient_checkpointing"):
            module.gradient_checkpointing = value

    def forward(
        self,
        hidden_states: torch.Tensor,
        timestep: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        encoder_mask: torch.Tensor,
        image_rotary_emb: torch.Tensor,
        cross_attention_kwargs: Dict[str, Any] = None,
        return_dict=True,
    ) -> torch.Tensor:
        """
        Forward pass of LuminaNextDiT.

        Parameters:
            hidden_states (torch.Tensor): Input tensor of shape (N, C, H, W).
            timestep (torch.Tensor): Tensor of diffusion timesteps of shape (N,).
            encoder_hidden_states (torch.Tensor): Tensor of caption features of shape (N, D).
            encoder_mask (torch.Tensor): Tensor of caption masks of shape (N, L).
        """
        # Patchify the latent image; also returns the padding mask, per-sample
        # image sizes, and the rotary embedding aligned to the patch grid.
        hidden_states, mask, img_size, image_rotary_emb = self.patch_embedder(hidden_states, image_rotary_emb)
        image_rotary_emb = image_rotary_emb.to(hidden_states.device)
        # breakpoint()
        encoder_hidden_states = self.caption_projection(encoder_hidden_states)
        temb = self.time_caption_embed(timestep, encoder_hidden_states, encoder_mask)

        encoder_mask = encoder_mask.bool()

        for layer in self.layers:
            if self.training and self.gradient_checkpointing:

                def create_custom_forward(module, return_dict=None):
                    def custom_forward(*inputs):
                        if return_dict is not None:
                            return module(*inputs, return_dict=return_dict)
                        else:
                            return module(*inputs)

                    return custom_forward

                # Non-reentrant checkpointing is required on torch >= 1.11.
                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                hidden_states = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(layer),
                    hidden_states,
                    mask,
                    image_rotary_emb,
                    encoder_hidden_states,
                    encoder_mask,
                    temb,
                    cross_attention_kwargs,
                    **ckpt_kwargs,
                )
            else:
                hidden_states = layer(
                    hidden_states,
                    mask,
                    image_rotary_emb,
                    encoder_hidden_states,
                    encoder_mask,
                    temb=temb,
                    cross_attention_kwargs=cross_attention_kwargs,
                )

        hidden_states = self.norm_out(hidden_states, temb)

        # unpatchify: drop any padding tokens, then fold the patch grid back
        # into an (N, out_channels, H, W) image tensor.
        height_tokens = width_tokens = self.patch_size
        height, width = img_size[0]
        batch_size = hidden_states.size(0)
        sequence_length = (height // height_tokens) * (width // width_tokens)
        hidden_states = hidden_states[:, :sequence_length].view(
            batch_size, height // height_tokens, width // width_tokens, height_tokens, width_tokens, self.out_channels
        )
        output = hidden_states.permute(0, 5, 1, 3, 2, 4).flatten(4, 5).flatten(2, 3)

        if not return_dict:
            return (output,)

        return Transformer2DModelOutput(sample=output)
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/make_delta.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
from tqdm import tqdm
|
| 5 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 6 |
+
from blip3o.model.utils import auto_upgrade
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id):
    """Compute a weight delta (target minus base) and save it.

    Loads both models in fp16, subtracts the base weights from the target
    in place (handling vocab-resized embedding/lm_head matrices by
    subtracting only the overlapping slice), then saves the delta model and
    the target tokenizer to `delta_path`, optionally pushing to the Hub.
    """
    print("Loading base model")
    base = AutoModelForCausalLM.from_pretrained(
        base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)

    print("Loading target model")
    auto_upgrade(target_model_path)
    target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)

    print("Calculating delta")
    # PERF FIX: state_dict() builds a fresh dict on every call; the original
    # re-materialized it several times per loop iteration. Fetch it once.
    base_state = base.state_dict()
    for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"):
        if name not in base_state:
            # Projector weights exist only in the target model; skip them.
            assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model'
            continue
        if param.data.shape == base_state[name].shape:
            param.data -= base_state[name]
        else:
            # Embedding/lm_head may have grown (added tokens): subtract only
            # the region covered by the base weights.
            assert name in ['model.embed_tokens.weight', 'lm_head.weight'], f'{name} dimension mismatch: {param.data.shape} vs {base_state[name].shape}'
            bparam = base_state[name]
            param.data[:bparam.shape[0], :bparam.shape[1]] -= bparam

    print("Saving delta")
    if hub_repo_id:
        kwargs = {"push_to_hub": True, "repo_id": hub_repo_id}
    else:
        kwargs = {}
    target.save_pretrained(delta_path, **kwargs)
    target_tokenizer = AutoTokenizer.from_pretrained(target_model_path)
    target_tokenizer.save_pretrained(delta_path, **kwargs)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# CLI entry point: compute and save a weight delta between two checkpoints.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-model-path", type=str, required=True)
    parser.add_argument("--target-model-path", type=str, required=True)
    parser.add_argument("--delta-path", type=str, required=True)
    # Optional: push the resulting delta to this Hub repo as well.
    parser.add_argument("--hub-repo-id", type=str, default=None)
    args = parser.parse_args()

    make_delta(args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id)
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/__pycache__/builder.cpython-311.pyc
ADDED
|
Binary file (3.5 kB). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/__pycache__/clip_encoder.cpython-311.pyc
ADDED
|
Binary file (11.9 kB). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/__pycache__/imagebind.cpython-311.pyc
ADDED
|
Binary file (4.86 kB). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/__pycache__/open_clip_encoder.cpython-311.pyc
ADDED
|
Binary file (11.2 kB). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/__pycache__/siglip_encoder.cpython-311.pyc
ADDED
|
Binary file (36.9 kB). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/builder.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from .clip_encoder import CLIPVisionTower
|
| 3 |
+
from .imagebind import ImageBindWrapper
|
| 4 |
+
from .open_clip_encoder import OpenCLIPVisionTower
|
| 5 |
+
from .siglip_encoder import SigLipVisionTower
|
| 6 |
+
from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2
|
| 7 |
+
|
| 8 |
+
from .eva_clip.eva_clip_encoder import EvaClipVisionTower
|
| 9 |
+
from .dev_eva_clip.eva_vit import EvaViTWrapper
|
| 10 |
+
|
| 11 |
+
from blip3o.model.nextdit_crossattn import NextDiTCrossAttnConfig, NextDiTCrossAttn
|
| 12 |
+
from diffusers.models import AutoencoderKL
|
| 13 |
+
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
|
| 14 |
+
from transformers import AutoModel, AutoProcessor, SiglipVisionModel, AutoConfig
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def build_vision_tower(vision_tower_cfg, **kwargs):
    """Instantiate the understanding-side vision encoder from a config.

    The tower identifier is read from ``mm_vision_tower`` on
    *vision_tower_cfg*, falling back to ``vision_tower``. Only SigLIP2
    checkpoints are currently supported.

    Args:
        vision_tower_cfg: config object carrying the tower name.
        **kwargs: accepted for interface parity with the other builders;
            currently unused.

    Returns:
        A ``SiglipVisionModel`` loaded from the configured checkpoint.

    Raises:
        ValueError: if no tower name is configured or it is unrecognized.
    """
    vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
    if vision_tower is None:
        # Fail with a clear message instead of a TypeError on the `in` test below.
        raise ValueError('No vision tower specified in config (mm_vision_tower / vision_tower).')
    if "siglip2" in vision_tower:
        # `sdpa` attention works everywhere without a flash-attn dependency.
        return SiglipVisionModel.from_pretrained(vision_tower, attn_implementation="sdpa")
    raise ValueError(f'Unknown vision tower: {vision_tower}')
| 25 |
+
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def build_gen_vision_tower(vision_tower_cfg, **kwargs):
    """Build the generation-side vision encoder named by ``gen_vision_tower``.

    Dispatch order: SigLIP2 checkpoints first, then EVA-CLIP, then CLIP-style
    towers (a local path, or openai/laion/ShareGPT4V checkpoints, optionally
    wrapped in the S2 multi-scale tower). Anything else raises ``ValueError``.
    """
    tower_name = vision_tower_cfg.gen_vision_tower

    if "siglip2" in tower_name:
        # `sdpa` attention avoids a hard dependency on flash-attn.
        return SiglipVisionModel.from_pretrained(tower_name, attn_implementation="sdpa")

    if "eva" in tower_name:
        return EvaClipVisionTower(tower_name, args=vision_tower_cfg, **kwargs)

    looks_like_clip = (
        os.path.exists(tower_name)
        or tower_name.startswith(("openai", "laion"))
        or "ShareGPT4V" in tower_name
    )
    if looks_like_clip:
        tower_cls = CLIPVisionTowerS2 if getattr(vision_tower_cfg, 's2', False) else CLIPVisionTower
        return tower_cls(tower_name, args=vision_tower_cfg, **kwargs)

    raise ValueError(f'Unknown vision tower: {tower_name}')
| 44 |
+
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def build_dit(vision_tower_cfg, **kwargs):
    """Construct the NextDiT cross-attention diffusion head and its scheduler.

    If *vision_tower_cfg* does not already carry a ``hidden_size``, it is
    filled in from the pretrained config of ``model_name_or_path``.

    Returns:
        Tuple of ``(dit, noise_scheduler)`` where the scheduler is the
        flow-matching Euler scheduler shipped with Lumina-Next-SFT.
    """
    if not hasattr(vision_tower_cfg, "hidden_size"):
        # Derive the latent width from the backbone model when not set explicitly.
        backbone_cfg = AutoConfig.from_pretrained(vision_tower_cfg.model_name_or_path)
        vision_tower_cfg.hidden_size = backbone_cfg.hidden_size

    dit_config = NextDiTCrossAttnConfig(latent_embedding_size=vision_tower_cfg.hidden_size)
    dit = NextDiTCrossAttn(dit_config)
    noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained("Alpha-VLLM/Lumina-Next-SFT-diffusers", subfolder="scheduler")
    return dit, noise_scheduler
| 54 |
+
|
| 55 |
+
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/clip_encoder.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
|
| 4 |
+
|
| 5 |
+
try:
|
| 6 |
+
from s2wrapper import forward as multiscale_forward
|
| 7 |
+
except:
|
| 8 |
+
pass
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class CLIPVisionTower(nn.Module):
    """Frozen CLIP vision encoder that returns features from a selected hidden layer."""

    def __init__(self, vision_tower, args, delay_load=False):
        """Set up the tower, loading weights eagerly unless delayed.

        Args:
            vision_tower: HuggingFace model name or local path of the CLIP checkpoint.
            args: config object providing ``mm_vision_select_layer`` and optionally
                ``mm_vision_select_feature``, ``unfreeze_mm_vision_tower``,
                ``mm_tunable_parts``.
            delay_load: if True, skip weight loading unless the config indicates
                the tower weights will be trained (and thus must exist).
        """
        super().__init__()

        # Guards against double-loading in load_model().
        self.is_loaded = False

        self.vision_tower_name = vision_tower
        # Index into output.hidden_states to take features from (e.g. -2).
        self.select_layer = args.mm_vision_select_layer
        self.select_feature = getattr(args, "mm_vision_select_feature", "patch")

        if not delay_load:
            print(f"Loading vision tower: {vision_tower}")
            self.load_model()
        elif getattr(args, "unfreeze_mm_vision_tower", False):
            # TODO: better detector is needed.
            print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.")
            self.load_model()
        elif hasattr(args, "mm_tunable_parts") and "mm_vision_tower" in args.mm_tunable_parts:
            # Load eagerly when the vision tower itself is listed as tunable.
            print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.")
            self.load_model()
        else:
            # Delay-load path: keep only the config so shape properties still work.
            self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)

    def load_model(self, device_map=None):
        """Load processor and frozen encoder weights; idempotent."""
        if self.is_loaded:
            print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name))
            return

        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        # The tower is kept frozen; gradients never flow into CLIP weights here.
        self.vision_tower.requires_grad_(False)

        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Pick features from the encoder's hidden states per ``select_feature``.

        ``slicefour_*`` concatenates four evenly spaced layers along the feature
        dim; ``slice_m25811_f6_*`` concatenates a fixed set of five layers; the
        default takes the single layer at ``select_layer``. A trailing
        ``patch`` strips the CLS token, ``cls_patch`` keeps it.
        """
        select_feature_type = self.select_feature

        if self.select_feature in ["slicefour_patch", "slicefour_cls_patch"]:
            # Concatenate every k-th layer (k = num_layers // 4), offset by select_layer.
            select_every_k_layer = len(image_forward_outs.hidden_states) // 4
            image_features = torch.cat([image_forward_outs.hidden_states[i] for i in range(select_every_k_layer + self.select_layer, len(image_forward_outs.hidden_states), select_every_k_layer)], dim=-1)
            select_feature_type = select_feature_type.replace("slicefour_", "")
        elif self.select_feature in ["slice_m25811_f6_patch", "slice_m25811_f6_cls_patch"]:
            # Fixed layer mix: four from the top (-2,-5,-8,-11) plus layer 6.
            select_layers = [-2, -5, -8, -11, 6]
            image_features = torch.cat([image_forward_outs.hidden_states[i] for i in select_layers], dim=-1)
            select_feature_type = select_feature_type.replace("slice_m25811_f6_", "")
        else:
            image_features = image_forward_outs.hidden_states[self.select_layer]

        if select_feature_type == "patch":
            # Drop the leading CLS token; keep only patch tokens.
            image_features = image_features[:, 1:]
        elif select_feature_type == "cls_patch":
            # Keep CLS + patch tokens unchanged (intentional no-op).
            image_features = image_features
        else:
            raise ValueError(f"Unexpected select feature: {select_feature_type}")
        return image_features

    def forward(self, images):
        """Encode images; accepts a list of per-image tensors or one batched tensor."""
        if type(images) is list:
            image_features = []
            for image in images:
                # One image at a time: add a batch dim, encode, select features,
                # and cast back to the input dtype.
                image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
                image_feature = self.feature_select(image_forward_out).to(image.dtype)
                image_features.append(image_feature)
        else:
            image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
            image_features = self.feature_select(image_forward_outs).to(images.dtype)

        return image_features

    @property
    def dummy_feature(self):
        # Zero feature vector matching the tower's device/dtype; used as a placeholder.
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # Falls back to the stashed config when weights were delay-loaded.
        if self.is_loaded:
            return self.vision_tower.config
        else:
            return self.cfg_only

    @property
    def hidden_size(self):
        # Multi-layer selections concatenate along the feature dim, so the
        # effective width is a multiple of the base hidden size.
        _hidden_size = self.config.hidden_size
        if "slicefour" in self.select_feature:
            _hidden_size *= 4
        if "slice_m25811_f6" in self.select_feature:
            _hidden_size *= 5
        return _hidden_size

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def num_patches(self):
        _num_patches = (self.config.image_size // self.config.patch_size) ** 2
        if "cls_patch" in self.select_feature:
            # One extra token when the CLS token is retained.
            _num_patches += 1
        return _num_patches

    @property
    def image_size(self):
        return self.config.image_size
| 123 |
+
|
| 124 |
+
class CLIPVisionTowerS2(CLIPVisionTower):
    """CLIP tower variant that runs the encoder at multiple image scales (S2)
    via ``s2wrapper`` and concatenates the per-scale features."""

    def __init__(self, vision_tower, args, delay_load=False):

        # Parse the scale list BEFORE super().__init__(): the parent constructor
        # may call our load_model() override, which reads s2_image_size below.
        self.s2_scales = getattr(args, "s2_scales", "336,672,1008")
        self.s2_scales = list(map(int, self.s2_scales.split(",")))
        self.s2_scales.sort()
        # Smallest scale is the tile size for splitting; largest is the input size.
        self.s2_split_size = self.s2_scales[0]
        self.s2_image_size = self.s2_scales[-1]

        super().__init__(vision_tower, args, delay_load)

        # change resize/crop size in preprocessing to the largest image size in s2_scale
        if not delay_load or getattr(args, "unfreeze_mm_vision_tower", False):
            self.image_processor.size["shortest_edge"] = self.s2_image_size
            self.image_processor.crop_size["height"] = self.image_processor.crop_size["width"] = self.s2_image_size

    def load_model(self, device_map=None):
        """Load processor and frozen encoder, then retarget preprocessing to the
        largest S2 scale; idempotent."""
        if self.is_loaded:
            print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name))
            return

        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
        self.vision_tower.requires_grad_(False)

        # Preprocess at the largest scale; s2wrapper downsamples internally.
        self.image_processor.size["shortest_edge"] = self.s2_image_size
        self.image_processor.crop_size["height"] = self.image_processor.crop_size["width"] = self.s2_image_size

        self.is_loaded = True

    def forward_feature(self, images):
        # Single-scale encode; used as the callback passed to multiscale_forward.
        image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
        image_features = self.feature_select(image_forward_outs).to(images.dtype)
        return image_features

    def forward(self, images):
        """Encode at all configured scales; accepts a list of tensors or a batch."""
        if type(images) is list:
            image_features = []
            for image in images:
                image_feature = multiscale_forward(self.forward_feature, image.unsqueeze(0), img_sizes=self.s2_scales, max_split_size=self.s2_split_size, split_forward=True)
                image_features.append(image_feature)
        else:
            image_features = multiscale_forward(self.forward_feature, images, img_sizes=self.s2_scales, max_split_size=self.s2_split_size, split_forward=True)

        return image_features

    @property
    def hidden_size(self):
        # Per-scale features are concatenated, so the width grows with the
        # number of scales.
        return self.config.hidden_size * len(self.s2_scales)
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/__pycache__/eva_vit.cpython-311.pyc
ADDED
|
Binary file (8.76 kB). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
|
| 2 |
+
from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer
|
| 3 |
+
from .factory import list_models, add_model_config, get_model_config, load_checkpoint
|
| 4 |
+
from .loss import ClipLoss
|
| 5 |
+
from .model import CLIP, CustomCLIP, CLIPTextCfg, CLIPVisionCfg, convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype
|
| 6 |
+
from .openai import load_openai_model, list_openai_models
|
| 7 |
+
from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained
|
| 8 |
+
from .tokenizer import SimpleTokenizer, tokenize
|
| 9 |
+
from .transform import image_transform
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (1.67 kB). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/constants.cpython-311.pyc
ADDED
|
Binary file (316 Bytes). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/eva_vit_model.cpython-311.pyc
ADDED
|
Binary file (34.1 kB). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/factory.cpython-311.pyc
ADDED
|
Binary file (27.8 kB). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/hf_configs.cpython-311.pyc
ADDED
|
Binary file (830 Bytes). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/hf_model.cpython-311.pyc
ADDED
|
Binary file (13.7 kB). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/loss.cpython-311.pyc
ADDED
|
Binary file (6.78 kB). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/model.cpython-311.pyc
ADDED
|
Binary file (25.3 kB). View file
|
|
|
UMM/BLIP3o-Qwen3-Siglip2/blip3o/model/multimodal_encoder/dev_eva_clip/eva_clip/__pycache__/modified_resnet.cpython-311.pyc
ADDED
|
Binary file (13.3 kB). View file
|
|
|