# Model section: network variant, resolution, precision, and attention/FFN options.
model:
  model: SanaMSVideoCamCtrl_1600M_P1_D20
  image_size: 720
  aspect_ratio_type: ASPECT_RATIO_VIDEO_720_MS_DIV32
  mixed_precision: bf16
  fp32_attention: true
  multi_scale: true
  camctrl_type: BidirectionalGDNUCPESinglePathLiteLABothTriton
  attn_type: BidirectionalGDNTriton
  softmax_every_n: 4
  linear_head_dim: 112
  conv_kernel_size: 4
  k_conv_only: true
  ffn_type: GLUMBConvTemp
  t_kernel_size: 3
  # The third entry is deliberately empty; it is spelled out as `null` so it
  # cannot be misread as belonging to the following key.
  mlp_acts:
    - silu
    - silu
    - null
  mlp_ratio: 3
  use_pe: true
  pos_embed_type: wan_rope
  qk_norm: true
  cross_norm: true
  class_dropout_prob: 0.0
  chunk_split_strategy: first_chunk_plus_one
  cam_attn_compress: 1
  init_cam_from_base: true
  use_chunk_plucker_post_attn: true
  chunk_plucker_channels: 48
  chunk_plucker_post_attn_blocks: 20

# VAE section: autoencoder type, checkpoint location, and tiling/framewise options.
vae:
  vae_type: LTX2VAE_diffusers
  # AutoencoderKLLTX2Video.from_pretrained(<path>, subfolder="vae") expects
  # <path> to be a parent directory containing a ``vae/`` diffusers folder.
  # The public release repo hosts that ``vae/`` folder at its root.
  vae_pretrained: hf://Efficient-Large-Model/SANA-WM_bidirectional
  weight_dtype: bfloat16
  vae_latent_dim: 128
  vae_downsample_rate: 32
  vae_stride: [8, 32, 32]
  use_framewise_encoding: true
  use_framewise_decoding: true
  tile_sample_stride_num_frames: 64
  tile_sample_min_num_frames: 96

# Text-encoder section: encoder name, embedding normalisation, and the
# prompt-enhancement preamble (one list item per line of the preamble).
text_encoder:
  text_encoder_name: gemma-2-2b-it
  y_norm: true
  y_norm_scale_factor: 0.01
  model_max_length: 300
  chi_prompt:
    - 'Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:'
    - '- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.'
    - '- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.'
    - 'Here are examples of how to transform or refine prompts:'
    - '- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.'
    - '- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.'
    - 'Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:'
    # The trailing space inside the quotes is part of the value; keep it.
    - 'User Prompt: '

# Scheduler section: flow-prediction settings and the sampler used for visualisation.
scheduler:
  predict_flow_v: true
  noise_schedule: linear_flow
  pred_sigma: false
  flow_shift: 9.95
  inference_flow_shift: 9.8
  vis_sampler: flow_dpm-solver