Image-Text-to-Text · MLX · Safetensors

Tags: zaya1_vl · zaya · mixture-of-experts · hybrid-attention · cca-attention · apple-silicon · reasoning · tool-use · quantized · vision · multimodal · vision-language · qwen2_5_vl-vit · mxfp4 · jang · osaurus · conversational
Instructions to use OsaurusAI/ZAYA1-VL-8B-MXFP4 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
  - MLX
How to use OsaurusAI/ZAYA1-VL-8B-MXFP4 with MLX:
```python
# Make sure mlx-vlm is installed:
# pip install --upgrade mlx-vlm
from mlx_vlm import load, generate
from mlx_vlm.prompt_utils import apply_chat_template
from mlx_vlm.utils import load_config

# Load the model and its config
model, processor = load("OsaurusAI/ZAYA1-VL-8B-MXFP4")
config = load_config("OsaurusAI/ZAYA1-VL-8B-MXFP4")

# Prepare input: one image URL and a text prompt
image = ["http://images.cocodataset.org/val2017/000000039769.jpg"]
prompt = "Describe this image."

# Apply the chat template
formatted_prompt = apply_chat_template(
    processor, config, prompt, num_images=1
)

# Generate output
output = generate(model, processor, formatted_prompt, image)
print(output)
```
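For repeated queries it helps to load the model once and reuse it. The sketch below is ours, not part of mlx-vlm: it wraps the same calls shown above in a hypothetical `describe` helper, and depending on the mlx-vlm version `generate` may return a plain string or a richer result object.

```python
from mlx_vlm import load, generate
from mlx_vlm.prompt_utils import apply_chat_template
from mlx_vlm.utils import load_config

MODEL_ID = "OsaurusAI/ZAYA1-VL-8B-MXFP4"

# Load the weights and config once, then reuse them across calls;
# reloading an 8B model for every query would dominate runtime.
model, processor = load(MODEL_ID)
config = load_config(MODEL_ID)

def describe(image_url: str, prompt: str = "Describe this image."):
    """Hypothetical convenience wrapper around the calls shown above."""
    formatted = apply_chat_template(processor, config, prompt, num_images=1)
    return generate(model, processor, formatted, [image_url])

print(describe("http://images.cocodataset.org/val2017/000000039769.jpg"))
```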
- Notebooks
  - Google Colab
  - Kaggle
- Local Apps
  - LM Studio
The repository's config.json:

```json
{
  "activation_func": "swiglu",
  "activation_func_fp8_input_store": false,
  "add_bias_linear": false,
  "apply_rope_fusion": true,
  "ar_threshold": 1,
  "architectures": [
    "Zaya1VLForConditionalGeneration"
  ],
  "attention_bias": false,
  "bias_activation_fusion": true,
  "bos_token_id": 2,
  "cca": true,
  "clamp_temp": false,
  "eos_token_id": 262143,
  "ffn_hidden_size": 4096,
  "fused_add_norm": false,
  "gated_linear_unit": true,
  "hidden_size": 2048,
  "head_dim": 128,
  "image_token_id": 262147,
  "lm_head_bias": false,
  "lora_rank": 0,
  "max_position_embeddings": 32768,
  "model_type": "zaya1_vl",
  "moe_router_topk": 1,
  "norm_epsilon": 1e-05,
  "normalization": "RMSNorm",
  "num_attention_heads": 8,
  "num_experts": 16,
  "num_hidden_layers": 40,
  "num_key_value_heads": 2,
  "num_query_groups": 2,
  "pad_token_id": 0,
  "padding_side": "right",
  "projector_hidden_act": "gelu",
  "residual_in_fp32": false,
  "rope_pct": 0.5,
  "rotary_base": 1000000,
  "scale_residual_merge": true,
  "sliding_window": null,
  "temporal_patch_size": 1,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.57.1",
  "use_lora_att": false,
  "use_rope_scaling": false,
  "vision_config": {
    "_attn_implementation_autoset": true,
    "hidden_size": 1280,
    "in_chans": 3,
    "model_type": "qwen2_5_vl",
    "out_hidden_size": 2048,
    "spatial_patch_size": 14,
    "temporal_patch_size": 1,
    "tokens_per_second": 2,
    "torch_dtype": "bfloat16"
  },
  "vision_end_token_id": 256000,
  "vision_lora": true,
  "vision_lora_rank_attn": 8,
  "vision_lora_rank_mlp": 32,
  "vision_start_token_id": 255999,
  "vocab_size": 262272,
  "zaya_mlp_expansion": 256,
  "zaya_use_eda": true,
  "zaya_use_mod": true,
  "weight_format": "mxfp4",
  "zaya_expert_layout": "split_switch_mlp",
  "quantization": {
    "bits": 4,
    "group_size": 32,
    "mode": "affine",
    "router_bits": 16,
    "expert_layout": "split_switch_mlp"
  },
  "capabilities": {
    "reasoning_parser": "qwen3",
    "tool_parser": "zaya_xml",
    "think_in_template": false,
    "supports_tools": true,
    "supports_thinking": true,
    "family": "zaya1_vl",
    "modality": "vision",
    "cache_type": "hybrid"
  }
}
```
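A few of these fields are worth calling out before running the model. As a quick sanity check, here is a small sketch (ours, not part of the repository; it assumes `huggingface_hub` is installed and relies only on field names visible above) that downloads config.json and prints the quantization, routing, and attention settings:

```python
import json
from huggingface_hub import hf_hub_download

# Download and parse the repository's config.json
path = hf_hub_download("OsaurusAI/ZAYA1-VL-8B-MXFP4", "config.json")
with open(path) as f:
    cfg = json.load(f)

# Quantization: 4-bit affine weights in groups of 32; the MoE router
# is kept at 16-bit precision.
q = cfg["quantization"]
print(f"weights: {q['bits']}-bit {q['mode']}, group size {q['group_size']}, "
      f"router {q['router_bits']}-bit")

# MoE routing: top-1 of 16 experts, so each token activates only one
# expert MLP per MoE layer.
print(f"routing: top-{cfg['moe_router_topk']} of {cfg['num_experts']} experts")

# Grouped-query attention: 8 query heads share 2 key/value heads.
print(f"attention: {cfg['num_attention_heads']} query heads, "
      f"{cfg['num_key_value_heads']} KV heads, head_dim {cfg['head_dim']}")
```

With `moe_router_topk` of 1 over 16 experts, only one expert MLP runs per token per MoE layer, which keeps per-token compute well below what the total parameter count suggests.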