Efficient-Large-Model
/

SANA-WM_bidirectional

Model card Files Files and versions

SANA-WM_bidirectional / config.yaml

HaoyiZhu's picture

Upload config.yaml with huggingface_hub

6f60830 verified 4 days ago

history blame contribute delete

2.73 kB

	model:
	model: SanaMSVideoCamCtrl_1600M_P1_D20
	image_size: 720
	aspect_ratio_type: ASPECT_RATIO_VIDEO_720_MS_DIV32
	mixed_precision: bf16
	fp32_attention: true
	multi_scale: true
	camctrl_type: BidirectionalGDNUCPESinglePathLiteLABothTriton
	attn_type: BidirectionalGDNTriton
	softmax_every_n: 4
	linear_head_dim: 112
	conv_kernel_size: 4
	k_conv_only: true
	ffn_type: GLUMBConvTemp
	t_kernel_size: 3
	mlp_acts:
	- silu
	- silu
	-
	mlp_ratio: 3
	use_pe: true
	pos_embed_type: wan_rope
	qk_norm: true
	cross_norm: true
	class_dropout_prob: 0.0
	chunk_split_strategy: first_chunk_plus_one
	cam_attn_compress: 1
	init_cam_from_base: true
	use_chunk_plucker_post_attn: true
	chunk_plucker_channels: 48
	chunk_plucker_post_attn_blocks: 20

	vae:
	vae_type: LTX2VAE_diffusers
	# AutoencoderKLLTX2Video.from_pretrained(<root>, subfolder="vae") expects
	# a parent directory containing a ``vae/`` diffusers folder. The public
	# release repo hosts that ``vae/`` folder at its root.
	vae_pretrained: hf://Efficient-Large-Model/SANA-WM_bidirectional
	weight_dtype: bfloat16
	vae_latent_dim: 128
	vae_downsample_rate: 32
	vae_stride: [8, 32, 32]
	use_framewise_encoding: true
	use_framewise_decoding: true
	tile_sample_stride_num_frames: 64
	tile_sample_min_num_frames: 96

	text_encoder:
	text_encoder_name: gemma-2-2b-it
	y_norm: true
	y_norm_scale_factor: 0.01
	model_max_length: 300
	chi_prompt:
	- 'Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:'
	- '- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.'
	- '- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.'
	- 'Here are examples of how to transform or refine prompts:'
	- '- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.'
	- '- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.'
	- 'Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:'
	- 'User Prompt: '

	scheduler:
	predict_flow_v: true
	noise_schedule: linear_flow
	pred_sigma: false
	flow_shift: 9.95
	inference_flow_shift: 9.8
	vis_sampler: flow_dpm-solver