| { |
| "model_type": "s1-omni-image", |
| "architectures": [ |
| "S1OmniImageModel" |
| ], |
| "prefixes": { |
| "qwen3_vl": "qwen3_vl.", |
| "transformer": "transformer.", |
| "vae": "vae.", |
| "alignment_mlp": "alignment_mlp." |
| }, |
| "qwen3_vl_config": { |
| "architectures": [ |
| "Qwen3VLForConditionalGeneration" |
| ], |
| "dtype": "bfloat16", |
| "eos_token_id": 151645, |
| "hidden_size": 4096, |
| "image_token_id": 151655, |
| "model_type": "qwen3_vl", |
| "pad_token_id": 151643, |
| "text_config": { |
| "attention_bias": false, |
| "attention_dropout": 0.0, |
| "bos_token_id": 151643, |
| "dtype": "bfloat16", |
| "eos_token_id": 151645, |
| "head_dim": 128, |
| "hidden_act": "silu", |
| "hidden_size": 4096, |
| "initializer_range": 0.02, |
| "intermediate_size": 12288, |
| "max_position_embeddings": 262144, |
| "model_type": "qwen3_vl_text", |
| "num_attention_heads": 32, |
| "num_hidden_layers": 36, |
| "num_key_value_heads": 8, |
| "pad_token_id": 151643, |
| "rms_norm_eps": 1e-06, |
| "rope_scaling": { |
| "mrope_interleaved": true, |
| "mrope_section": [ |
| 24, |
| 20, |
| 20 |
| ], |
| "rope_type": "default" |
| }, |
| "rope_theta": 5000000, |
| "use_cache": false, |
| "vocab_size": 151936 |
| }, |
| "tie_word_embeddings": false, |
| "transformers_version": "4.57.6", |
| "video_token_id": 151656, |
| "vision_config": { |
| "deepstack_visual_indexes": [ |
| 8, |
| 16, |
| 24 |
| ], |
| "depth": 27, |
| "dtype": "bfloat16", |
| "hidden_act": "gelu_pytorch_tanh", |
| "hidden_size": 1152, |
| "in_channels": 3, |
| "initializer_range": 0.02, |
| "intermediate_size": 4304, |
| "model_type": "qwen3_vl", |
| "num_heads": 16, |
| "num_position_embeddings": 2304, |
| "out_hidden_size": 4096, |
| "pad_token_id": 151643, |
| "patch_size": 16, |
| "spatial_merge_size": 2, |
| "temporal_patch_size": 2 |
| }, |
| "vision_end_token_id": 151653, |
| "vision_start_token_id": 151652 |
| }, |
| "transformer_config": { |
| "_class_name": "QwenImageTransformer2DModel", |
| "_diffusers_version": "0.36.0.dev0", |
| "attention_head_dim": 128, |
| "axes_dims_rope": [ |
| 16, |
| 56, |
| 56 |
| ], |
| "guidance_embeds": false, |
| "in_channels": 64, |
| "joint_attention_dim": 3584, |
| "num_attention_heads": 24, |
| "num_layers": 60, |
| "out_channels": 16, |
| "patch_size": 2, |
| "zero_cond_t": true |
| }, |
| "vae_config": { |
| "_class_name": "AutoencoderKLQwenImage", |
| "_diffusers_version": "0.36.0.dev0", |
| "attn_scales": [], |
| "base_dim": 96, |
| "dim_mult": [ |
| 1, |
| 2, |
| 4, |
| 4 |
| ], |
| "dropout": 0.0, |
| "latents_mean": [ |
| -0.7571, |
| -0.7089, |
| -0.9113, |
| 0.1075, |
| -0.1745, |
| 0.9653, |
| -0.1517, |
| 1.5508, |
| 0.4134, |
| -0.0715, |
| 0.5517, |
| -0.3632, |
| -0.1922, |
| -0.9497, |
| 0.2503, |
| -0.2921 |
| ], |
| "latents_std": [ |
| 2.8184, |
| 1.4541, |
| 2.3275, |
| 2.6558, |
| 1.2196, |
| 1.7708, |
| 2.6052, |
| 2.0743, |
| 3.2687, |
| 2.1526, |
| 2.8652, |
| 1.5579, |
| 1.6382, |
| 1.1253, |
| 2.8251, |
| 1.916 |
| ], |
| "num_res_blocks": 2, |
| "temperal_downsample": [ |
| false, |
| true, |
| true |
| ], |
| "z_dim": 16 |
| }, |
| "scheduler_config": { |
| "_class_name": "FlowMatchEulerDiscreteScheduler", |
| "_diffusers_version": "0.36.0.dev0", |
| "base_image_seq_len": 256, |
| "base_shift": 0.5, |
| "invert_sigmas": false, |
| "max_image_seq_len": 8192, |
| "max_shift": 0.9, |
| "num_train_timesteps": 1000, |
| "shift": 1.0, |
| "shift_terminal": 0.02, |
| "stochastic_sampling": false, |
| "time_shift_type": "exponential", |
| "use_beta_sigmas": false, |
| "use_dynamic_shifting": true, |
| "use_exponential_sigmas": false, |
| "use_karras_sigmas": false |
| }, |
| "alignment_mlp": { |
| "input_dim": 4096, |
| "output_dim": 3584, |
| "hidden_dim": 4096, |
| "type": "mlp" |
| }, |
| "special_tokens": { |
| "think_start": "<think>", |
| "think_end": "</think>", |
| "image_gen_start": "<image_gen>", |
| "image_gen_end": "</image_gen>", |
| "image_edit_start": "<image_edit>", |
| "image_edit_end": "</image_edit>" |
| }, |
| "generation": { |
| "max_new_tokens": 2048, |
| "temperature": 0.7, |
| "top_p": 0.9, |
| "do_sample": true |
| }, |
| "image_generation": { |
| "default_height": 1024, |
| "default_width": 1024, |
| "num_inference_steps": 50, |
| "guidance_scale": 1.0, |
| "true_cfg_scale": 4.0 |
| } |
| } |