File size: 2,727 Bytes
6f60830
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# Architecture and precision settings for the network named in `model.model`.
# NOTE(review): the meaning of each key is defined by the consuming code, which
# is not visible from this file; notes below only state what the values show.
model:
  model: SanaMSVideoCamCtrl_1600M_P1_D20
  # 720 also appears in the aspect-ratio table name on the next key.
  image_size: 720
  aspect_ratio_type: ASPECT_RATIO_VIDEO_720_MS_DIV32
  mixed_precision: bf16
  fp32_attention: true
  multi_scale: true
  camctrl_type: BidirectionalGDNUCPESinglePathLiteLABothTriton
  attn_type: BidirectionalGDNTriton
  softmax_every_n: 4
  linear_head_dim: 112
  conv_kernel_size: 4
  k_conv_only: true
  ffn_type: GLUMBConvTemp
  t_kernel_size: 3
  mlp_acts:
    - silu
    - silu
    # The third entry is null. It is written out explicitly because a bare "-"
    # parses to the same null but reads like a truncated line.
    - null
  mlp_ratio: 3
  use_pe: true
  pos_embed_type: wan_rope
  qk_norm: true
  cross_norm: true
  class_dropout_prob: 0.0
  chunk_split_strategy: first_chunk_plus_one
  cam_attn_compress: 1
  init_cam_from_base: true
  use_chunk_plucker_post_attn: true
  chunk_plucker_channels: 48
  # NOTE(review): 20 matches the "D20" suffix of the model name; presumably
  # this covers every block of the network. Confirm against the model code.
  chunk_plucker_post_attn_blocks: 20

# VAE settings: type, pretrained location, weight dtype, latent size, strides,
# and the framewise / tiling options.
vae:
  vae_type: LTX2VAE_diffusers
  # AutoencoderKLLTX2Video.from_pretrained(<root>, subfolder="vae") expects
  # a parent directory containing a ``vae/`` diffusers folder. The public
  # release repo hosts that ``vae/`` folder at its root.
  vae_pretrained: hf://Efficient-Large-Model/SANA-WM_bidirectional
  weight_dtype: bfloat16
  vae_latent_dim: 128
  # The last two vae_stride entries equal vae_downsample_rate (32).
  # NOTE(review): stride order is presumably [time, height, width]; confirm.
  vae_downsample_rate: 32
  vae_stride: [8, 32, 32]
  use_framewise_encoding: true
  use_framewise_decoding: true
  # NOTE(review): the stride (64) is smaller than the minimum tile length (96),
  # which presumably makes consecutive temporal tiles overlap; confirm.
  tile_sample_stride_num_frames: 64
  tile_sample_min_num_frames: 96

# Text-encoder settings plus the prompt-enhancement instruction (chi_prompt).
text_encoder:
  text_encoder_name: gemma-2-2b-it
  y_norm: true
  y_norm_scale_factor: 0.01
  model_max_length: 300
  # Instruction text stored as a list of string fragments.
  # NOTE(review): how the fragments are joined (newline vs. space) is decided
  # by the consuming code; confirm before merging or reflowing entries.
  # NOTE(review): the text says "image generation" while this file configures
  # a model whose name contains "Video"; presumably inherited wording, confirm.
  chi_prompt:
    - 'Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions suitable for image generation. Evaluate the level of detail in the user prompt:'
    - '- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, and spatial relationships to create vivid and concrete scenes.'
    - '- If the prompt is already detailed, refine and enhance the existing details slightly without overcomplicating.'
    - 'Here are examples of how to transform or refine prompts:'
    - '- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.'
    - '- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus passing by towering glass skyscrapers.'
    - 'Please generate only the enhanced description for the prompt below and avoid including any additional commentary or evaluations:'
    # The trailing space inside the quotes is part of the value; keep the
    # quotes so it is not stripped.
    - 'User Prompt: '

# Noise-schedule and sampler settings.
scheduler:
  predict_flow_v: true
  noise_schedule: linear_flow
  pred_sigma: false
  # Two different shift values are configured: 9.95 for flow_shift and 9.8 for
  # inference_flow_shift.
  # NOTE(review): presumably flow_shift is the training-time value and
  # inference_flow_shift the sampling-time one; confirm the gap is intended.
  flow_shift: 9.95
  inference_flow_shift: 9.8
  vis_sampler: flow_dpm-solver