# Configuration values for a camera-conditioned STDiT video model.
# NOTE(review): the script that consumes this file is not visible here; the
# section headings below are descriptive groupings only -- confirm each
# setting's meaning against the loader before relying on it.

# Shared placeholders, left as None in this file and referenced further down.
CACHE_DIR = None
PRETRAINED_MODEL = None

# --- Clip sampling -----------------------------------------------------------
num_frames = 16
frame_interval = 1
image_size = (256, 256)
fps = 30 // 2  # integer division; evaluates to 15

# --- Dataset location / loading ---------------------------------------------
root = None
data_path = "CSV_PATH"  # placeholder string as written; presumably replaced by a real path
use_image_transform = False
num_workers = 6

# --- Precision / parallelism -------------------------------------------------
dtype = "fp16"
grad_checkpoint = False
plugin = "zero2"
sp_size = 1
data_prefetch = 1

# --- Camera-conditioning constants ------------------------------------------
MODEL_DIM = 1152
CAMERA_FORMAT = 'extrinsic'
# NOTE(review): 12 presumably corresponds to a flattened 3x4 extrinsic
# matrix -- confirm against the dataset code.
CAMERA_PARAMS_NUM = 12

# --- Model components --------------------------------------------------------
model = dict(
    type="STDiT-XL/2",
    space_scale=0.5,
    time_scale=1.0,
    from_pretrained=PRETRAINED_MODEL,
    enable_flashattn=True,
    enable_layernorm_kernel=True,
    # Two entries: MODEL_DIM + CAMERA_PARAMS_NUM (1164) and MODEL_DIM (1152).
    camera_fuser_linear_dims=[MODEL_DIM + CAMERA_PARAMS_NUM, MODEL_DIM],
    camera_format=CAMERA_FORMAT,
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
    cache_dir=CACHE_DIR,
)
text_encoder = dict(
    type="t5",
    from_pretrained="DeepFloyd/t5-v1_1-xxl",
    model_max_length=120,
    shardformer=True,
    cache_dir=CACHE_DIR,
)
scheduler = dict(
    type="iddpm_camera",
    # NOTE(review): the "_t" / "_c" suffixes presumably mean text and camera
    # guidance scales -- confirm against the scheduler implementation.
    cfg_scale_t=6.0,
    cfg_scale_c=4.0,
)

# --- Run bookkeeping ---------------------------------------------------------
seed = 42
wandb = True

epochs = 12
log_every = 300
ckpt_every = 2000

# --- Dataset options ---------------------------------------------------------
dataset = dict(
    text_dropout=0.05,
    camera_dropout=0.05,
    static_camera_rate=0.0,
    resolution=256,
    version='v0.7',
    frame_strides=[4, 5, 6, 7, 8],
    plucker_coord=False,
    expand_rt=False,
)

# --- Optimisation ------------------------------------------------------------
load = None
batch_size = 6
lr = 1e-5
grad_clip = 1.0
freeze_model = True
# NOTE(review): presumably the only sub-modules left trainable when
# freeze_model is True -- confirm against the training script.
active_layer_names = ['camera_fuser', 'attn_temp']

# --- Prompt / output settings ------------------------------------------------
# NOTE(review): these look like sampling-time settings; confirm which entry
# point reads them.
prompt_path = "./assets/texts/realestate10k.txt"

camera_path = ''
nprompts = None
save_dir = None