"""
registry.py

Exhaustive list of pretrained VLMs (with full descriptions / links to corresponding names and sections of the paper).
"""
|
|
|
|
|
|
|
# Registry of pretrained VLMs, keyed by canonical model ID. Every entry has the same shape:
#   - "model_id":    the canonical ID (must be identical to the entry's key)
#   - "names":       human-readable aliases that resolve to this entry
#   - "description": metadata dict with the keys "name", "optimization_procedure", "visual_representation",
#                    "image_processing", "language_model", "datasets", and "train_epochs"
MODEL_REGISTRY = {
    # === LLaVa v1.5 Reproductions (multi-stage optimization) ===
    "reproduction-llava-v15+7b": {
        "model_id": "reproduction-llava-v15+7b",
        "names": ["LLaVa v1.5 7B (Reproduction)"],
        "description": {
            "name": "LLaVa v1.5 7B (Reproduction)",
            "optimization_procedure": "multi-stage",
            "visual_representation": "CLIP ViT-L/14 @ 336px",
            "image_processing": "Letterbox",
            "language_model": "Vicuña v1.5 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },
    "reproduction-llava-v15+13b": {
        "model_id": "reproduction-llava-v15+13b",
        "names": ["LLaVa v1.5 13B (Reproduction)"],
        "description": {
            "name": "LLaVa v1.5 13B (Reproduction)",
            "optimization_procedure": "multi-stage",
            "visual_representation": "CLIP ViT-L/14 @ 336px",
            "image_processing": "Letterbox",
            "language_model": "Vicuña v1.5 13B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },

    # === Single-Stage Baselines (the 7B entry carries the most aliases, including "Base") ===
    "one-stage+7b": {
        "model_id": "one-stage+7b",
        "names": [
            "One-Stage 7B",
            "Single-Stage 7B",
            "Frozen ViT (Single-Stage)",
            "CLIP ViT-L 336px (Letterbox)",
            "CLIP ViT-L 336px",
            "Vicuña v1.5 7B",
            "1 Epoch",
            "Base",
        ],
        "description": {
            "name": "Single-Stage 7B",
            "optimization_procedure": "single-stage",
            "visual_representation": "CLIP ViT-L/14 @ 336px",
            "image_processing": "Letterbox",
            "language_model": "Vicuña v1.5 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },
    "one-stage+13b": {
        "model_id": "one-stage+13b",
        "names": [
            "One-Stage 13B",
            "Single-Stage 13B",
            "Vicuña v1.5 13B",
        ],
        "description": {
            "name": "Single-Stage 13B",
            "optimization_procedure": "single-stage",
            "visual_representation": "CLIP ViT-L/14 @ 336px",
            "image_processing": "Letterbox",
            "language_model": "Vicuña v1.5 13B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },

    # === Full Finetuning of the ViT (multi-stage vs. single-stage) ===
    "full-ft-multi-stage+7b": {
        "model_id": "full-ft-multi-stage+7b",
        "names": ["Finetune ViT (Multi-Stage)"],
        "description": {
            "name": "Finetune ViT (Multi-Stage)",
            "optimization_procedure": "multi-stage-full-finetune",
            "visual_representation": "CLIP ViT-L/14 @ 336px",
            "image_processing": "Letterbox",
            "language_model": "Vicuña v1.5 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },
    "full-ft-one-stage+7b": {
        "model_id": "full-ft-one-stage+7b",
        "names": ["Finetune ViT (Single-Stage)"],
        "description": {
            "name": "Finetune ViT (Single-Stage)",
            "optimization_procedure": "single-stage-full-finetune",
            "visual_representation": "CLIP ViT-L/14 @ 336px",
            "image_processing": "Letterbox",
            "language_model": "Vicuña v1.5 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },

    # === Visual Representations @ 224px ===
    "in1k-224px+7b": {
        "model_id": "in1k-224px+7b",
        "names": ["IN1K ViT-L 224px"],
        "description": {
            "name": "IN1K ViT-L 224px",
            "optimization_procedure": "single-stage",
            "visual_representation": "ImageNet-21K+1K ViT-L/16 @ 224px",
            "image_processing": "Letterbox",
            "language_model": "Vicuña v1.5 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },
    "dinov2-224px+7b": {
        "model_id": "dinov2-224px+7b",
        "names": ["DINOv2 ViT-L 224px"],
        "description": {
            "name": "DINOv2 ViT-L 224px",
            "optimization_procedure": "single-stage",
            "visual_representation": "DINOv2 ViT-L/14 @ 224px",
            "image_processing": "Letterbox",
            "language_model": "Vicuña v1.5 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },
    "clip-224px+7b": {
        "model_id": "clip-224px+7b",
        "names": ["CLIP ViT-L 224px"],
        "description": {
            "name": "CLIP ViT-L 224px",
            "optimization_procedure": "single-stage",
            "visual_representation": "CLIP ViT-L/14 @ 224px",
            "image_processing": "Letterbox",
            "language_model": "Vicuña v1.5 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },
    "siglip-224px+7b": {
        "model_id": "siglip-224px+7b",
        "names": ["SigLIP ViT-SO 224px"],
        "description": {
            "name": "SigLIP ViT-SO 224px",
            "optimization_procedure": "single-stage",
            "visual_representation": "SigLIP ViT-SO/14 @ 224px",
            "image_processing": "Letterbox",
            "language_model": "Vicuña v1.5 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },

    # === Image Processing Variants at Higher Resolution (CLIP @ 336px, SigLIP @ 384px) ===
    "clip-336px-resize-crop+7b": {
        "model_id": "clip-336px-resize-crop+7b",
        "names": ["CLIP ViT-L 336px (Resize Crop)"],
        "description": {
            "name": "CLIP ViT-L 336px (Resize Crop)",
            "optimization_procedure": "single-stage",
            "visual_representation": "CLIP ViT-L/14 @ 336px",
            "image_processing": "Resize Crop",
            "language_model": "Vicuña v1.5 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },
    "clip-336px-resize-naive+7b": {
        "model_id": "clip-336px-resize-naive+7b",
        "names": ["CLIP ViT-L 336px (Naive Resize)", "CLIP 336px (Naive Resize)"],
        "description": {
            "name": "CLIP ViT-L 336px (Naive Resize)",
            "optimization_procedure": "single-stage",
            "visual_representation": "CLIP ViT-L/14 @ 336px",
            "image_processing": "Naive Resize",
            "language_model": "Vicuña v1.5 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },
    "siglip-384px-letterbox+7b": {
        "model_id": "siglip-384px-letterbox+7b",
        "names": ["SigLIP ViT-SO 384px (Letterbox)", "SigLIP ViT-SO 384px"],
        "description": {
            "name": "SigLIP ViT-SO 384px (Letterbox)",
            "optimization_procedure": "single-stage",
            "visual_representation": "SigLIP ViT-SO/14 @ 384px",
            "image_processing": "Letterbox",
            "language_model": "Vicuña v1.5 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },
    "siglip-384px-resize-crop+7b": {
        "model_id": "siglip-384px-resize-crop+7b",
        "names": ["SigLIP ViT-SO 384px (Resize Crop)"],
        "description": {
            "name": "SigLIP ViT-SO 384px (Resize Crop)",
            "optimization_procedure": "single-stage",
            "visual_representation": "SigLIP ViT-SO/14 @ 384px",
            "image_processing": "Resize Crop",
            "language_model": "Vicuña v1.5 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },
    "siglip-384px-resize-naive+7b": {
        "model_id": "siglip-384px-resize-naive+7b",
        "names": ["SigLIP ViT-SO 384px (Naive Resize)", "SigLIP 384px (Naive Resize)"],
        "description": {
            "name": "SigLIP ViT-SO 384px (Naive Resize)",
            "optimization_procedure": "single-stage",
            "visual_representation": "SigLIP ViT-SO/14 @ 384px",
            "image_processing": "Naive Resize",
            "language_model": "Vicuña v1.5 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },

    # === Fused Visual Representations (DINOv2 + CLIP, DINOv2 + SigLIP) ===
    "dinoclip-336px-letterbox+7b": {
        "model_id": "dinoclip-336px-letterbox+7b",
        "names": ["DINOv2 + CLIP 336px (Letterbox)"],
        "description": {
            "name": "DINOv2 + CLIP 336px (Letterbox)",
            "optimization_procedure": "single-stage",
            "visual_representation": "DINOv2 ViT-L/14 + CLIP ViT-L/14 @ 336px",
            "image_processing": "Letterbox",
            "language_model": "Vicuña v1.5 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },
    "dinoclip-336px-resize-naive+7b": {
        "model_id": "dinoclip-336px-resize-naive+7b",
        "names": ["DINOv2 + CLIP 336px (Naive Resize)"],
        "description": {
            "name": "DINOv2 + CLIP 336px (Naive Resize)",
            "optimization_procedure": "single-stage",
            "visual_representation": "DINOv2 ViT-L/14 + CLIP ViT-L/14 @ 336px",
            "image_processing": "Naive Resize",
            "language_model": "Vicuña v1.5 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },
    # NOTE(review): the two entries below say "SigLIP ViT-L/14" while the Prism DINOSigLIP entries further down
    # say "SigLIP ViT-SO/14" -- left untouched; confirm which backbone was actually used.
    "dinosiglip-384px-letterbox+7b": {
        "model_id": "dinosiglip-384px-letterbox+7b",
        "names": ["DINOv2 + SigLIP 384px (Letterbox)"],
        "description": {
            "name": "DINOv2 + SigLIP 384px (Letterbox)",
            "optimization_procedure": "single-stage",
            "visual_representation": "DINOv2 ViT-L/14 + SigLIP ViT-L/14 @ 384px",
            "image_processing": "Letterbox",
            "language_model": "Vicuña v1.5 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },
    "dinosiglip-384px-resize-naive+7b": {
        "model_id": "dinosiglip-384px-resize-naive+7b",
        "names": ["DINOv2 + SigLIP 384px (Naive Resize)"],
        "description": {
            "name": "DINOv2 + SigLIP 384px (Naive Resize)",
            "optimization_procedure": "single-stage",
            "visual_representation": "DINOv2 ViT-L/14 + SigLIP ViT-L/14 @ 384px",
            "image_processing": "Naive Resize",
            "language_model": "Vicuña v1.5 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },

    # === Language Model Variants (Llama-2) ===
    "llama2+7b": {
        "model_id": "llama2+7b",
        "names": ["Llama-2 7B"],
        "description": {
            "name": "Llama-2 7B",
            "optimization_procedure": "single-stage",
            "visual_representation": "CLIP ViT-L/14 @ 336px",
            "image_processing": "Letterbox",
            "language_model": "Llama-2 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },
    "llama2+13b": {
        "model_id": "llama2+13b",
        "names": ["Llama-2 13B"],
        "description": {
            "name": "Llama-2 13B",
            "optimization_procedure": "single-stage",
            "visual_representation": "CLIP ViT-L/14 @ 336px",
            "image_processing": "Letterbox",
            "language_model": "Llama-2 13B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },

    # === No Co-training (multimodal-only data) ===
    "vicuna-no-cotraining+7b": {
        "model_id": "vicuna-no-cotraining+7b",
        "names": ["Vicuña v1.5 7B (No Co-training)"],
        "description": {
            "name": "Vicuña v1.5 7B (No Co-training)",
            "optimization_procedure": "single-stage",
            "visual_representation": "CLIP ViT-L/14 @ 336px",
            "image_processing": "Letterbox",
            "language_model": "Vicuña v1.5 7B",
            "datasets": ["LLaVa v1.5 Multimodal-Only"],
            "train_epochs": 1,
        },
    },
    "llama2-no-cotraining+7b": {
        "model_id": "llama2-no-cotraining+7b",
        "names": ["Llama-2 7B (No Co-training)"],
        "description": {
            "name": "Llama-2 7B (No Co-training)",
            "optimization_procedure": "single-stage",
            "visual_representation": "CLIP ViT-L/14 @ 336px",
            "image_processing": "Letterbox",
            "language_model": "Llama-2 7B",
            "datasets": ["LLaVa v1.5 Multimodal-Only"],
            "train_epochs": 1,
        },
    },

    # === Training Duration (number of epochs) ===
    "train-1.25-epochs+7b": {
        "model_id": "train-1.25-epochs+7b",
        "names": ["1.25 Epochs"],
        "description": {
            "name": "1.25 Epochs",
            "optimization_procedure": "single-stage",
            "visual_representation": "CLIP ViT-L/14 @ 336px",
            "image_processing": "Letterbox",
            "language_model": "Vicuña v1.5 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1.25,
        },
    },
    "train-1.5-epochs+7b": {
        "model_id": "train-1.5-epochs+7b",
        "names": ["1.5 Epochs"],
        "description": {
            "name": "1.5 Epochs",
            "optimization_procedure": "single-stage",
            "visual_representation": "CLIP ViT-L/14 @ 336px",
            "image_processing": "Letterbox",
            "language_model": "Vicuña v1.5 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1.5,
        },
    },
    "train-2-epochs+7b": {
        "model_id": "train-2-epochs+7b",
        "names": ["2 Epochs"],
        "description": {
            "name": "2 Epochs",
            "optimization_procedure": "single-stage",
            "visual_representation": "CLIP ViT-L/14 @ 336px",
            "image_processing": "Letterbox",
            "language_model": "Vicuña v1.5 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 2,
        },
    },
    "train-3-epochs+7b": {
        "model_id": "train-3-epochs+7b",
        "names": ["3 Epochs"],
        "description": {
            "name": "3 Epochs",
            "optimization_procedure": "single-stage",
            "visual_representation": "CLIP ViT-L/14 @ 336px",
            "image_processing": "Letterbox",
            "language_model": "Vicuña v1.5 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 3,
        },
    },

    # === Additional Training Data (LVIS-Instruct-4V, LRV-Instruct) ===
    "llava-lvis4v+7b": {
        "model_id": "llava-lvis4v+7b",
        "names": ["Base + LVIS-4V"],
        "description": {
            "name": "Base + LVIS-4V",
            "optimization_procedure": "single-stage",
            "visual_representation": "CLIP ViT-L/14 @ 336px",
            "image_processing": "Letterbox",
            "language_model": "Vicuña v1.5 7B",
            "datasets": ["LLaVa v1.5 Instruct", "LVIS-Instruct-4V"],
            "train_epochs": 1,
        },
    },
    "llava-lrv+7b": {
        "model_id": "llava-lrv+7b",
        "names": ["Base + LRV"],
        "description": {
            "name": "Base + LRV",
            "optimization_procedure": "single-stage",
            "visual_representation": "CLIP ViT-L/14 @ 336px",
            "image_processing": "Letterbox",
            "language_model": "Vicuña v1.5 7B",
            "datasets": ["LLaVa v1.5 Instruct", "LRV-Instruct"],
            "train_epochs": 1,
        },
    },
    "llava-lvis4v-lrv+7b": {
        "model_id": "llava-lvis4v-lrv+7b",
        "names": ["Base + LVIS-4V + LRV"],
        "description": {
            "name": "Base + LVIS-4V + LRV",
            "optimization_procedure": "single-stage",
            "visual_representation": "CLIP ViT-L/14 @ 336px",
            "image_processing": "Letterbox",
            "language_model": "Vicuña v1.5 7B",
            "datasets": ["LLaVa v1.5 Instruct", "LVIS-Instruct-4V", "LRV-Instruct"],
            "train_epochs": 1,
        },
    },

    # === Prism Models: CLIP ===
    "prism-clip-controlled+7b": {
        "model_id": "prism-clip-controlled+7b",
        "names": ["Prism-CLIP 7B (Controlled)"],
        "description": {
            "name": "CLIP Prism 7B (Controlled)",
            "optimization_procedure": "single-stage",
            "visual_representation": "CLIP ViT-L/14 @ 336px",
            "image_processing": "Naive Resize",
            "language_model": "Llama-2 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },
    "prism-clip-controlled+13b": {
        "model_id": "prism-clip-controlled+13b",
        "names": ["Prism-CLIP 13B (Controlled)"],
        "description": {
            "name": "CLIP Prism 13B (Controlled)",
            "optimization_procedure": "single-stage",
            "visual_representation": "CLIP ViT-L/14 @ 336px",
            "image_processing": "Naive Resize",
            "language_model": "Llama-2 13B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },
    "prism-clip+7b": {
        "model_id": "prism-clip+7b",
        "names": ["Prism-CLIP 7B"],
        "description": {
            "name": "CLIP Prism 7B",
            "optimization_procedure": "single-stage",
            "visual_representation": "CLIP ViT-L/14 @ 336px",
            "image_processing": "Naive Resize",
            "language_model": "Llama-2 7B",
            "datasets": ["LLaVa v1.5 Instruct", "LVIS-Instruct-4V", "LRV-Instruct"],
            "train_epochs": 2,
        },
    },
    "prism-clip+13b": {
        "model_id": "prism-clip+13b",
        "names": ["Prism-CLIP 13B"],
        "description": {
            "name": "CLIP Prism 13B",
            "optimization_procedure": "single-stage",
            "visual_representation": "CLIP ViT-L/14 @ 336px",
            "image_processing": "Naive Resize",
            "language_model": "Llama-2 13B",
            "datasets": ["LLaVa v1.5 Instruct", "LVIS-Instruct-4V", "LRV-Instruct"],
            "train_epochs": 2,
        },
    },

    # === Prism Models: SigLIP ===
    "prism-siglip-controlled+7b": {
        "model_id": "prism-siglip-controlled+7b",
        "names": ["Prism-SigLIP 7B (Controlled)"],
        "description": {
            "name": "SigLIP Prism 7B (Controlled)",
            "optimization_procedure": "single-stage",
            "visual_representation": "SigLIP ViT-SO/14 @ 384px",
            "image_processing": "Naive Resize",
            "language_model": "Llama-2 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },
    "prism-siglip-controlled+13b": {
        # Fixed: this previously read "prism-siglip-controlled+7b", pointing the 13B entry at the 7B model.
        "model_id": "prism-siglip-controlled+13b",
        "names": ["Prism-SigLIP 13B (Controlled)"],
        "description": {
            "name": "SigLIP Prism 13B (Controlled)",
            "optimization_procedure": "single-stage",
            "visual_representation": "SigLIP ViT-SO/14 @ 384px",
            "image_processing": "Naive Resize",
            "language_model": "Llama-2 13B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },
    "prism-siglip+7b": {
        "model_id": "prism-siglip+7b",
        "names": ["Prism-SigLIP 7B"],
        "description": {
            "name": "SigLIP Prism 7B",
            "optimization_procedure": "single-stage",
            "visual_representation": "SigLIP ViT-SO/14 @ 384px",
            "image_processing": "Naive Resize",
            "language_model": "Llama-2 7B",
            "datasets": ["LLaVa v1.5 Instruct", "LVIS-Instruct-4V", "LRV-Instruct"],
            "train_epochs": 2,
        },
    },
    "prism-siglip+13b": {
        "model_id": "prism-siglip+13b",
        "names": ["Prism-SigLIP 13B"],
        "description": {
            "name": "SigLIP Prism 13B",
            "optimization_procedure": "single-stage",
            "visual_representation": "SigLIP ViT-SO/14 @ 384px",
            "image_processing": "Naive Resize",
            "language_model": "Llama-2 13B",
            "datasets": ["LLaVa v1.5 Instruct", "LVIS-Instruct-4V", "LRV-Instruct"],
            "train_epochs": 2,
        },
    },

    # === Prism Models: DINOv2 + SigLIP ===
    "prism-dinosiglip-controlled+7b": {
        "model_id": "prism-dinosiglip-controlled+7b",
        "names": ["Prism-DINOSigLIP 7B (Controlled)", "Prism 7B (Controlled)"],
        "description": {
            "name": "DINOSigLIP Prism 7B (Controlled)",
            "optimization_procedure": "single-stage",
            "visual_representation": "DINOv2 ViT-L/14 + SigLIP ViT-SO/14 @ 384px",
            "image_processing": "Naive Resize",
            "language_model": "Llama-2 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },
    "prism-dinosiglip-controlled+13b": {
        "model_id": "prism-dinosiglip-controlled+13b",
        "names": ["Prism-DINOSigLIP 13B (Controlled)", "Prism 13B (Controlled)"],
        "description": {
            "name": "DINOSigLIP Prism 13B (Controlled)",
            "optimization_procedure": "single-stage",
            "visual_representation": "DINOv2 ViT-L/14 + SigLIP ViT-SO/14 @ 384px",
            "image_processing": "Naive Resize",
            "language_model": "Llama-2 13B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },
    "prism-dinosiglip+7b": {
        "model_id": "prism-dinosiglip+7b",
        "names": ["Prism-DINOSigLIP 7B"],
        "description": {
            "name": "DINOSigLIP Prism 7B",
            "optimization_procedure": "single-stage",
            "visual_representation": "DINOv2 ViT-L/14 + SigLIP ViT-SO/14 @ 384px",
            "image_processing": "Naive Resize",
            "language_model": "Llama-2 7B",
            "datasets": ["LLaVa v1.5 Instruct", "LVIS-Instruct-4V", "LRV-Instruct"],
            "train_epochs": 2,
        },
    },
    "prism-dinosiglip+13b": {
        "model_id": "prism-dinosiglip+13b",
        "names": ["Prism-DINOSigLIP 13B"],
        "description": {
            "name": "DINOSigLIP Prism 13B",
            "optimization_procedure": "single-stage",
            "visual_representation": "DINOv2 ViT-L/14 + SigLIP ViT-SO/14 @ 384px",
            "image_processing": "Naive Resize",
            "language_model": "Llama-2 13B",
            "datasets": ["LLaVa v1.5 Instruct", "LVIS-Instruct-4V", "LRV-Instruct"],
            "train_epochs": 2,
        },
    },

    # === Prism Models: DINOv2 + SigLIP @ 224px ===
    "prism-dinosiglip-224px-controlled+7b": {
        "model_id": "prism-dinosiglip-224px-controlled+7b",
        "names": ["Prism-DINOSigLIP 224px 7B (Controlled)"],
        "description": {
            "name": "DINOSigLIP 224px 7B (Controlled)",
            "optimization_procedure": "single-stage",
            # Fixed typo: was "SigLIP ViT-SO 14" (missing "/"), unlike every other backbone string here.
            "visual_representation": "DINOv2 ViT-L/14 + SigLIP ViT-SO/14 @ 224px",
            "image_processing": "Naive Resize",
            "language_model": "Llama-2 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },
    "prism-dinosiglip-224px+7b": {
        "model_id": "prism-dinosiglip-224px+7b",
        "names": ["Prism-DINOSigLIP 224px 7B"],
        "description": {
            "name": "DINOSigLIP 224px 7B",
            "optimization_procedure": "single-stage",
            # Fixed typo: was "SigLIP ViT-SO 14" (missing "/"), unlike every other backbone string here.
            "visual_representation": "DINOv2 ViT-L/14 + SigLIP ViT-SO/14 @ 224px",
            "image_processing": "Naive Resize",
            "language_model": "Llama-2 7B",
            "datasets": ["LLaVa v1.5 Instruct", "LVIS-Instruct-4V", "LRV-Instruct"],
            "train_epochs": 2,
        },
    },

    # === Additional Language Models (Llama-2 Chat, Mistral, Phi-2) ===
    "llama2-chat+7b": {
        "model_id": "llama2-chat+7b",
        "names": ["Llama-2 Chat 7B"],
        "description": {
            "name": "Llama-2 Chat 7B",
            "optimization_procedure": "single-stage",
            "visual_representation": "CLIP ViT-L/14 @ 336px",
            "image_processing": "Letterbox",
            "language_model": "Llama-2 Chat 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },
    "llama2-chat+13b": {
        "model_id": "llama2-chat+13b",
        "names": ["Llama-2 Chat 13B"],
        "description": {
            "name": "Llama-2 Chat 13B",
            "optimization_procedure": "single-stage",
            "visual_representation": "CLIP ViT-L/14 @ 336px",
            "image_processing": "Letterbox",
            "language_model": "Llama-2 Chat 13B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },
    "mistral-v0.1+7b": {
        "model_id": "mistral-v0.1+7b",
        "names": ["Mistral v0.1 7B"],
        "description": {
            "name": "Mistral v0.1 7B",
            "optimization_procedure": "single-stage",
            "visual_representation": "CLIP ViT-L/14 @ 336px",
            "image_processing": "Letterbox",
            "language_model": "Mistral v0.1 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },
    "mistral-instruct-v0.1+7b": {
        "model_id": "mistral-instruct-v0.1+7b",
        "names": ["Mistral Instruct v0.1 7B"],
        "description": {
            "name": "Mistral Instruct v0.1 7B",
            "optimization_procedure": "single-stage",
            "visual_representation": "CLIP ViT-L/14 @ 336px",
            "image_processing": "Letterbox",
            "language_model": "Mistral Instruct v0.1 7B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },
    "phi-2+3b": {
        "model_id": "phi-2+3b",
        "names": ["Phi-2 3B"],
        "description": {
            "name": "Phi-2 3B",
            "optimization_procedure": "single-stage",
            "visual_representation": "CLIP ViT-L/14 @ 336px",
            "image_processing": "Letterbox",
            "language_model": "Phi-2 3B",
            "datasets": ["LLaVa v1.5 Instruct"],
            "train_epochs": 1,
        },
    },
}
|
|
|
|
|
# Flat lookup table: each canonical model ID and each of its aliases in "names" maps to the same entry
# object held in MODEL_REGISTRY (no copies are made). If two identifiers collide, the later entry wins.
GLOBAL_REGISTRY = {
    identifier: entry
    for model_id, entry in MODEL_REGISTRY.items()
    for identifier in (model_id, *entry["names"])
}
|
|
|
|
|
|
|