Instructions to use SsharvienKumar/SWoMo with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Diffusers
How to use SsharvienKumar/SWoMo with Diffusers:
pip install -U diffusers transformers accelerate
import torch
from diffusers import DiffusionPipeline
from diffusers.utils import load_image, export_to_video

# switch to "mps" for apple devices
pipe = DiffusionPipeline.from_pretrained("SsharvienKumar/SWoMo", dtype=torch.bfloat16, device_map="cuda")
pipe.to("cuda")

prompt = "A man with short gray hair plays a red electric guitar."
image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/guitar-man.png"
)

output = pipe(image=image, prompt=prompt).frames[0]
export_to_video(output, "output.mp4")
- Notebooks
- Google Colab
- Kaggle
Upload 28 files
Browse files
- checkpoints/Cataract-1K/controlnet_img_graph_vid/checkpoint/controlnet/config.json +77 -0
- checkpoints/Cataract-1K/controlnet_img_graph_vid/checkpoint/controlnet/diffusion_pytorch_model.safetensors +3 -0
- checkpoints/Cataract-1K/controlnet_img_graph_vid/checkpoint/optimizer.bin +3 -0
- checkpoints/Cataract-1K/controlnet_img_graph_vid/checkpoint/random_states_0.pkl +3 -0
- checkpoints/Cataract-1K/controlnet_img_graph_vid/checkpoint/scheduler.bin +3 -0
- checkpoints/Cataract-1K/controlnet_ximg_graph_vid/checkpoint/controlnet/config.json +76 -0
- checkpoints/Cataract-1K/controlnet_ximg_graph_vid/checkpoint/controlnet/diffusion_pytorch_model.safetensors +3 -0
- checkpoints/Cataract-1K/controlnet_ximg_graph_vid/checkpoint/optimizer.bin +3 -0
- checkpoints/Cataract-1K/controlnet_ximg_graph_vid/checkpoint/random_states_0.pkl +3 -0
- checkpoints/Cataract-1K/controlnet_ximg_graph_vid/checkpoint/scheduler.bin +3 -0
- checkpoints/Cataract-1K/diffusion_img_graph_xvid/checkpoint/optimizer.bin +3 -0
- checkpoints/Cataract-1K/diffusion_img_graph_xvid/checkpoint/random_states_0.pkl +3 -0
- checkpoints/Cataract-1K/diffusion_img_graph_xvid/checkpoint/scheduler.bin +3 -0
- checkpoints/Cataract-1K/diffusion_img_graph_xvid/checkpoint/unet/config.json +79 -0
- checkpoints/Cataract-1K/diffusion_img_graph_xvid/checkpoint/unet/diffusion_pytorch_model.safetensors +3 -0
- checkpoints/Cataract-1K/diffusion_ximg_graph_xvid/checkpoint/optimizer.bin +3 -0
- checkpoints/Cataract-1K/diffusion_ximg_graph_xvid/checkpoint/random_states_0.pkl +3 -0
- checkpoints/Cataract-1K/diffusion_ximg_graph_xvid/checkpoint/scheduler.bin +3 -0
- checkpoints/Cataract-1K/diffusion_ximg_graph_xvid/checkpoint/unet/config.json +79 -0
- checkpoints/Cataract-1K/diffusion_ximg_graph_xvid/checkpoint/unet/diffusion_pytorch_model.safetensors +3 -0
- checkpoints/Cataract-1K/graphencoder_masked/best_val_loss.pth +3 -0
- checkpoints/Cataract-1K/graphencoder_segclip/best_val_loss.pth +3 -0
- checkpoints/Cataract-1K/vae_vid_diffusion/vae/config.json +37 -0
- checkpoints/Cataract-1K/vae_vid_diffusion/vae/diffusion_pytorch_model.safetensors +3 -0
- checkpoints/Cataract-1K/vqgan_image/checkpoint.ckpt +3 -0
- checkpoints/Cataract-1K/vqgan_image/config.yaml +57 -0
- checkpoints/Cataract-1K/vqgan_segmentation/checkpoint.ckpt +3 -0
- checkpoints/Cataract-1K/vqgan_segmentation/config.yaml +52 -0
checkpoints/Cataract-1K/controlnet_img_graph_vid/checkpoint/controlnet/config.json
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "ControlNetModel",
|
| 3 |
+
"_diffusers_version": "0.21.2",
|
| 4 |
+
"_name_or_path": "./checkpoints/Cataract-1k/video_diffusion/surgsimbridge_training_img_graph_vid_cataract1k-2026-04-15T07-35-15/checkpoints/checkpoint-145000",
|
| 5 |
+
"act_fn": "silu",
|
| 6 |
+
"addition_embed_type": null,
|
| 7 |
+
"addition_embed_type_num_heads": 64,
|
| 8 |
+
"addition_time_embed_dim": null,
|
| 9 |
+
"attention_head_dim": [
|
| 10 |
+
5,
|
| 11 |
+
10,
|
| 12 |
+
20,
|
| 13 |
+
20
|
| 14 |
+
],
|
| 15 |
+
"attention_type": "default",
|
| 16 |
+
"augment_temporal_attention": true,
|
| 17 |
+
"block_out_channels": [
|
| 18 |
+
320,
|
| 19 |
+
640,
|
| 20 |
+
1280,
|
| 21 |
+
1280
|
| 22 |
+
],
|
| 23 |
+
"class_embed_type": "identity",
|
| 24 |
+
"class_embeddings_concat": true,
|
| 25 |
+
"conditioning_channels": 3,
|
| 26 |
+
"conditioning_embedding_out_channels": [
|
| 27 |
+
16,
|
| 28 |
+
32,
|
| 29 |
+
96,
|
| 30 |
+
256
|
| 31 |
+
],
|
| 32 |
+
"conv_in_kernel": 3,
|
| 33 |
+
"cross_attention_dim": 1024,
|
| 34 |
+
"cross_attention_norm": null,
|
| 35 |
+
"down_block_types": [
|
| 36 |
+
"CrossAttnDownBlock2D",
|
| 37 |
+
"CrossAttnDownBlock2D",
|
| 38 |
+
"CrossAttnDownBlock2D",
|
| 39 |
+
"DownBlock2D"
|
| 40 |
+
],
|
| 41 |
+
"downsample_padding": 1,
|
| 42 |
+
"dropout": 0.0,
|
| 43 |
+
"dual_cross_attention": false,
|
| 44 |
+
"encoder_hid_dim": null,
|
| 45 |
+
"encoder_hid_dim_type": null,
|
| 46 |
+
"first_frame_condition_mode": "concat",
|
| 47 |
+
"flip_sin_to_cos": true,
|
| 48 |
+
"freq_shift": 0,
|
| 49 |
+
"global_pool_conditions": false,
|
| 50 |
+
"in_channels": 4,
|
| 51 |
+
"layers_per_block": 2,
|
| 52 |
+
"mid_block_only_cross_attention": null,
|
| 53 |
+
"mid_block_scale_factor": 1,
|
| 54 |
+
"mid_block_type": "UNetMidBlock2DCrossAttn",
|
| 55 |
+
"n_frames": 16,
|
| 56 |
+
"n_temp_heads": 8,
|
| 57 |
+
"norm_eps": 1e-05,
|
| 58 |
+
"norm_num_groups": 32,
|
| 59 |
+
"num_attention_heads": null,
|
| 60 |
+
"num_class_embeds": null,
|
| 61 |
+
"only_cross_attention": false,
|
| 62 |
+
"projection_class_embeddings_input_dim": null,
|
| 63 |
+
"resnet_out_scale_factor": 1.0,
|
| 64 |
+
"resnet_skip_time_act": false,
|
| 65 |
+
"resnet_time_scale_shift": "default",
|
| 66 |
+
"temp_pos_embedding": "rotary",
|
| 67 |
+
"time_cond_proj_dim": null,
|
| 68 |
+
"time_embedding_act_fn": null,
|
| 69 |
+
"time_embedding_dim": 512,
|
| 70 |
+
"time_embedding_type": "positional",
|
| 71 |
+
"timestep_post_act": null,
|
| 72 |
+
"transformer_layers_per_block": 1,
|
| 73 |
+
"upcast_attention": false,
|
| 74 |
+
"use_frame_stride_condition": false,
|
| 75 |
+
"use_linear_projection": true,
|
| 76 |
+
"use_temporal": true
|
| 77 |
+
}
|
checkpoints/Cataract-1K/controlnet_img_graph_vid/checkpoint/controlnet/diffusion_pytorch_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:820df30af2eb1f58e7482527ba44e6f032b207aa8158d2316bcd556c976ddc28
|
| 3 |
+
size 2102262840
|
checkpoints/Cataract-1K/controlnet_img_graph_vid/checkpoint/optimizer.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8354bc5f78800d0550eb25175805b996820198d470e8dd8d94b1f66af130d61b
|
| 3 |
+
size 4165435906
|
checkpoints/Cataract-1K/controlnet_img_graph_vid/checkpoint/random_states_0.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:496c5491d5471c8b259125aac6d453c89b669f87e41cb199b7cdac71d09de86f
|
| 3 |
+
size 15060
|
checkpoints/Cataract-1K/controlnet_img_graph_vid/checkpoint/scheduler.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eaa9a799183c7caa950febe94a4dab7ab200474b49179a367833ffed6b68d3c1
|
| 3 |
+
size 1000
|
checkpoints/Cataract-1K/controlnet_ximg_graph_vid/checkpoint/controlnet/config.json
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "ControlNetModel",
|
| 3 |
+
"_diffusers_version": "0.21.2",
|
| 4 |
+
"act_fn": "silu",
|
| 5 |
+
"addition_embed_type": null,
|
| 6 |
+
"addition_embed_type_num_heads": 64,
|
| 7 |
+
"addition_time_embed_dim": null,
|
| 8 |
+
"attention_head_dim": [
|
| 9 |
+
5,
|
| 10 |
+
10,
|
| 11 |
+
20,
|
| 12 |
+
20
|
| 13 |
+
],
|
| 14 |
+
"attention_type": "default",
|
| 15 |
+
"augment_temporal_attention": true,
|
| 16 |
+
"block_out_channels": [
|
| 17 |
+
320,
|
| 18 |
+
640,
|
| 19 |
+
1280,
|
| 20 |
+
1280
|
| 21 |
+
],
|
| 22 |
+
"class_embed_type": "identity",
|
| 23 |
+
"class_embeddings_concat": true,
|
| 24 |
+
"conditioning_channels": 3,
|
| 25 |
+
"conditioning_embedding_out_channels": [
|
| 26 |
+
16,
|
| 27 |
+
32,
|
| 28 |
+
96,
|
| 29 |
+
256
|
| 30 |
+
],
|
| 31 |
+
"conv_in_kernel": 3,
|
| 32 |
+
"cross_attention_dim": 1024,
|
| 33 |
+
"cross_attention_norm": null,
|
| 34 |
+
"down_block_types": [
|
| 35 |
+
"CrossAttnDownBlock2D",
|
| 36 |
+
"CrossAttnDownBlock2D",
|
| 37 |
+
"CrossAttnDownBlock2D",
|
| 38 |
+
"DownBlock2D"
|
| 39 |
+
],
|
| 40 |
+
"downsample_padding": 1,
|
| 41 |
+
"dropout": 0.0,
|
| 42 |
+
"dual_cross_attention": false,
|
| 43 |
+
"encoder_hid_dim": null,
|
| 44 |
+
"encoder_hid_dim_type": null,
|
| 45 |
+
"first_frame_condition_mode": "none",
|
| 46 |
+
"flip_sin_to_cos": true,
|
| 47 |
+
"freq_shift": 0,
|
| 48 |
+
"global_pool_conditions": false,
|
| 49 |
+
"in_channels": 4,
|
| 50 |
+
"layers_per_block": 2,
|
| 51 |
+
"mid_block_only_cross_attention": null,
|
| 52 |
+
"mid_block_scale_factor": 1,
|
| 53 |
+
"mid_block_type": "UNetMidBlock2DCrossAttn",
|
| 54 |
+
"n_frames": 16,
|
| 55 |
+
"n_temp_heads": 8,
|
| 56 |
+
"norm_eps": 1e-05,
|
| 57 |
+
"norm_num_groups": 32,
|
| 58 |
+
"num_attention_heads": null,
|
| 59 |
+
"num_class_embeds": null,
|
| 60 |
+
"only_cross_attention": false,
|
| 61 |
+
"projection_class_embeddings_input_dim": null,
|
| 62 |
+
"resnet_out_scale_factor": 1.0,
|
| 63 |
+
"resnet_skip_time_act": false,
|
| 64 |
+
"resnet_time_scale_shift": "default",
|
| 65 |
+
"temp_pos_embedding": "rotary",
|
| 66 |
+
"time_cond_proj_dim": null,
|
| 67 |
+
"time_embedding_act_fn": null,
|
| 68 |
+
"time_embedding_dim": 512,
|
| 69 |
+
"time_embedding_type": "positional",
|
| 70 |
+
"timestep_post_act": null,
|
| 71 |
+
"transformer_layers_per_block": 1,
|
| 72 |
+
"upcast_attention": false,
|
| 73 |
+
"use_frame_stride_condition": false,
|
| 74 |
+
"use_linear_projection": true,
|
| 75 |
+
"use_temporal": true
|
| 76 |
+
}
|
checkpoints/Cataract-1K/controlnet_ximg_graph_vid/checkpoint/controlnet/diffusion_pytorch_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:859a5b41e7b2044c643f6677da64ed4dd7b1220e2de20c6a29b4b89cabab4ef1
|
| 3 |
+
size 2102262840
|
checkpoints/Cataract-1K/controlnet_ximg_graph_vid/checkpoint/optimizer.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8045adfdaa09aa28b1fa7c556aaa02714d3e0244a78e401e74206b9b5b9c6e69
|
| 3 |
+
size 4165427650
|
checkpoints/Cataract-1K/controlnet_ximg_graph_vid/checkpoint/random_states_0.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9ebb76978b0d6edffa23a13921f66f7f593a0b46ede1ede4cc8dc2e7fd7b3e11
|
| 3 |
+
size 15060
|
checkpoints/Cataract-1K/controlnet_ximg_graph_vid/checkpoint/scheduler.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:532bd73f2b1d01dc0536daabcec58f40e379e94a680763a6e8bc6c9fc3e6d1c8
|
| 3 |
+
size 1000
|
checkpoints/Cataract-1K/diffusion_img_graph_xvid/checkpoint/optimizer.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:415b91ff1fe392da4416455500da6eb8d67edefdaf13408c838df9e03dbdb79e
|
| 3 |
+
size 9847488907
|
checkpoints/Cataract-1K/diffusion_img_graph_xvid/checkpoint/random_states_0.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c063c9a457d2f978404daef7c46ab0170e4a81320bd443c6daf6eb1b2dc483d3
|
| 3 |
+
size 15124
|
checkpoints/Cataract-1K/diffusion_img_graph_xvid/checkpoint/scheduler.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:94f607683c090632e8d779f519774dd0a2133d20ff4299b9edecbd7b1661901e
|
| 3 |
+
size 1000
|
checkpoints/Cataract-1K/diffusion_img_graph_xvid/checkpoint/unet/config.json
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "VideoLDMUNet3DConditionModel",
|
| 3 |
+
"_diffusers_version": "0.21.2",
|
| 4 |
+
"_name_or_path": "/gris/gris-f/homestud/ssivakum/SurgSimBridge_Gen/checkpoints/Cataract-1k/video_diffusion/surgsimbridge_training_img_graph_xvid_cataract1k-2026-03-26T20-30-27/checkpoints/checkpoint-245000",
|
| 5 |
+
"act_fn": "silu",
|
| 6 |
+
"addition_embed_type": null,
|
| 7 |
+
"addition_embed_type_num_heads": 64,
|
| 8 |
+
"addition_time_embed_dim": null,
|
| 9 |
+
"attention_head_dim": [
|
| 10 |
+
5,
|
| 11 |
+
10,
|
| 12 |
+
20,
|
| 13 |
+
20
|
| 14 |
+
],
|
| 15 |
+
"attention_type": "default",
|
| 16 |
+
"augment_temporal_attention": true,
|
| 17 |
+
"block_out_channels": [
|
| 18 |
+
320,
|
| 19 |
+
640,
|
| 20 |
+
1280,
|
| 21 |
+
1280
|
| 22 |
+
],
|
| 23 |
+
"center_input_sample": false,
|
| 24 |
+
"class_embed_type": "identity",
|
| 25 |
+
"class_embeddings_concat": true,
|
| 26 |
+
"conv_in_kernel": 3,
|
| 27 |
+
"conv_out_kernel": 3,
|
| 28 |
+
"cross_attention_dim": 1024,
|
| 29 |
+
"cross_attention_norm": null,
|
| 30 |
+
"down_block_types": [
|
| 31 |
+
"CrossAttnDownBlock2D",
|
| 32 |
+
"CrossAttnDownBlock2D",
|
| 33 |
+
"CrossAttnDownBlock2D",
|
| 34 |
+
"DownBlock2D"
|
| 35 |
+
],
|
| 36 |
+
"downsample_padding": 1,
|
| 37 |
+
"dropout": 0.0,
|
| 38 |
+
"dual_cross_attention": false,
|
| 39 |
+
"encoder_hid_dim": null,
|
| 40 |
+
"encoder_hid_dim_type": null,
|
| 41 |
+
"first_frame_condition_mode": "concat",
|
| 42 |
+
"flip_sin_to_cos": true,
|
| 43 |
+
"freq_shift": 0,
|
| 44 |
+
"in_channels": 4,
|
| 45 |
+
"layers_per_block": 2,
|
| 46 |
+
"mid_block_only_cross_attention": null,
|
| 47 |
+
"mid_block_scale_factor": 1,
|
| 48 |
+
"mid_block_type": "UNetMidBlock2DCrossAttn",
|
| 49 |
+
"n_frames": 16,
|
| 50 |
+
"n_temp_heads": 8,
|
| 51 |
+
"norm_eps": 1e-05,
|
| 52 |
+
"norm_num_groups": 32,
|
| 53 |
+
"num_attention_heads": null,
|
| 54 |
+
"num_class_embeds": null,
|
| 55 |
+
"only_cross_attention": false,
|
| 56 |
+
"out_channels": 4,
|
| 57 |
+
"projection_class_embeddings_input_dim": null,
|
| 58 |
+
"resnet_out_scale_factor": 1.0,
|
| 59 |
+
"resnet_skip_time_act": false,
|
| 60 |
+
"resnet_time_scale_shift": "default",
|
| 61 |
+
"sample_size": 64,
|
| 62 |
+
"temp_pos_embedding": "rotary",
|
| 63 |
+
"time_cond_proj_dim": null,
|
| 64 |
+
"time_embedding_act_fn": null,
|
| 65 |
+
"time_embedding_dim": 512,
|
| 66 |
+
"time_embedding_type": "positional",
|
| 67 |
+
"timestep_post_act": null,
|
| 68 |
+
"transformer_layers_per_block": 1,
|
| 69 |
+
"up_block_types": [
|
| 70 |
+
"UpBlock2D",
|
| 71 |
+
"CrossAttnUpBlock2D",
|
| 72 |
+
"CrossAttnUpBlock2D",
|
| 73 |
+
"CrossAttnUpBlock2D"
|
| 74 |
+
],
|
| 75 |
+
"upcast_attention": false,
|
| 76 |
+
"use_frame_stride_condition": false,
|
| 77 |
+
"use_linear_projection": true,
|
| 78 |
+
"use_temporal": true
|
| 79 |
+
}
|
checkpoints/Cataract-1K/diffusion_img_graph_xvid/checkpoint/unet/diffusion_pytorch_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8e5fd513ed200886c036a1da211a097784942a73340d9ddde34645aade9b64d8
|
| 3 |
+
size 4964732628
|
checkpoints/Cataract-1K/diffusion_ximg_graph_xvid/checkpoint/optimizer.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:37d6c917c05ab185f9d47db0e4fbb96b1c5ce2d47e0cf3ef4f2c01f427f9e1b4
|
| 3 |
+
size 9847488907
|
checkpoints/Cataract-1K/diffusion_ximg_graph_xvid/checkpoint/random_states_0.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4ce6f7a255ed67162de7f4c95beb6c09c9276622d1c43ff280d3c1c1120500cc
|
| 3 |
+
size 15060
|
checkpoints/Cataract-1K/diffusion_ximg_graph_xvid/checkpoint/scheduler.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f84e5f00e0f88087868886d1779ea0db653ba13f6a46103160bf4e63f9d73045
|
| 3 |
+
size 1000
|
checkpoints/Cataract-1K/diffusion_ximg_graph_xvid/checkpoint/unet/config.json
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "VideoLDMUNet3DConditionModel",
|
| 3 |
+
"_diffusers_version": "0.21.2",
|
| 4 |
+
"_name_or_path": "/gris/gris-f/homestud/ssivakum/SurgSimBridge_Gen/checkpoints/Cataract-1k/video_diffusion/surgsimbridge_training_ximg_graph_xvid_cataract1k-2026-03-29T17-21-00/checkpoints/checkpoint-240000",
|
| 5 |
+
"act_fn": "silu",
|
| 6 |
+
"addition_embed_type": null,
|
| 7 |
+
"addition_embed_type_num_heads": 64,
|
| 8 |
+
"addition_time_embed_dim": null,
|
| 9 |
+
"attention_head_dim": [
|
| 10 |
+
5,
|
| 11 |
+
10,
|
| 12 |
+
20,
|
| 13 |
+
20
|
| 14 |
+
],
|
| 15 |
+
"attention_type": "default",
|
| 16 |
+
"augment_temporal_attention": true,
|
| 17 |
+
"block_out_channels": [
|
| 18 |
+
320,
|
| 19 |
+
640,
|
| 20 |
+
1280,
|
| 21 |
+
1280
|
| 22 |
+
],
|
| 23 |
+
"center_input_sample": false,
|
| 24 |
+
"class_embed_type": "identity",
|
| 25 |
+
"class_embeddings_concat": true,
|
| 26 |
+
"conv_in_kernel": 3,
|
| 27 |
+
"conv_out_kernel": 3,
|
| 28 |
+
"cross_attention_dim": 1024,
|
| 29 |
+
"cross_attention_norm": null,
|
| 30 |
+
"down_block_types": [
|
| 31 |
+
"CrossAttnDownBlock2D",
|
| 32 |
+
"CrossAttnDownBlock2D",
|
| 33 |
+
"CrossAttnDownBlock2D",
|
| 34 |
+
"DownBlock2D"
|
| 35 |
+
],
|
| 36 |
+
"downsample_padding": 1,
|
| 37 |
+
"dropout": 0.0,
|
| 38 |
+
"dual_cross_attention": false,
|
| 39 |
+
"encoder_hid_dim": null,
|
| 40 |
+
"encoder_hid_dim_type": null,
|
| 41 |
+
"first_frame_condition_mode": "none",
|
| 42 |
+
"flip_sin_to_cos": true,
|
| 43 |
+
"freq_shift": 0,
|
| 44 |
+
"in_channels": 4,
|
| 45 |
+
"layers_per_block": 2,
|
| 46 |
+
"mid_block_only_cross_attention": null,
|
| 47 |
+
"mid_block_scale_factor": 1,
|
| 48 |
+
"mid_block_type": "UNetMidBlock2DCrossAttn",
|
| 49 |
+
"n_frames": 16,
|
| 50 |
+
"n_temp_heads": 8,
|
| 51 |
+
"norm_eps": 1e-05,
|
| 52 |
+
"norm_num_groups": 32,
|
| 53 |
+
"num_attention_heads": null,
|
| 54 |
+
"num_class_embeds": null,
|
| 55 |
+
"only_cross_attention": false,
|
| 56 |
+
"out_channels": 4,
|
| 57 |
+
"projection_class_embeddings_input_dim": null,
|
| 58 |
+
"resnet_out_scale_factor": 1.0,
|
| 59 |
+
"resnet_skip_time_act": false,
|
| 60 |
+
"resnet_time_scale_shift": "default",
|
| 61 |
+
"sample_size": 64,
|
| 62 |
+
"temp_pos_embedding": "rotary",
|
| 63 |
+
"time_cond_proj_dim": null,
|
| 64 |
+
"time_embedding_act_fn": null,
|
| 65 |
+
"time_embedding_dim": 512,
|
| 66 |
+
"time_embedding_type": "positional",
|
| 67 |
+
"timestep_post_act": null,
|
| 68 |
+
"transformer_layers_per_block": 1,
|
| 69 |
+
"up_block_types": [
|
| 70 |
+
"UpBlock2D",
|
| 71 |
+
"CrossAttnUpBlock2D",
|
| 72 |
+
"CrossAttnUpBlock2D",
|
| 73 |
+
"CrossAttnUpBlock2D"
|
| 74 |
+
],
|
| 75 |
+
"upcast_attention": false,
|
| 76 |
+
"use_frame_stride_condition": false,
|
| 77 |
+
"use_linear_projection": true,
|
| 78 |
+
"use_temporal": true
|
| 79 |
+
}
|
checkpoints/Cataract-1K/diffusion_ximg_graph_xvid/checkpoint/unet/diffusion_pytorch_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c7b626c753583df2dcbcab69355cb4769876782d990e97b169168195ad66e2e8
|
| 3 |
+
size 4964732628
|
checkpoints/Cataract-1K/graphencoder_masked/best_val_loss.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:60850527fd42a58b8422c757abfd72198e4b7602d757b74ba14480fc26fa2f86
|
| 3 |
+
size 350312386
|
checkpoints/Cataract-1K/graphencoder_segclip/best_val_loss.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:63e107df17abb7ae93f6e9f0e7c2192b1e0026de83e75c64f8b14d6bf9332e6c
|
| 3 |
+
size 277281262
|
checkpoints/Cataract-1K/vae_vid_diffusion/vae/config.json
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "AutoencoderKL",
|
| 3 |
+
"_diffusers_version": "0.31.0",
|
| 4 |
+
"act_fn": "silu",
|
| 5 |
+
"block_out_channels": [
|
| 6 |
+
128,
|
| 7 |
+
256,
|
| 8 |
+
512,
|
| 9 |
+
512
|
| 10 |
+
],
|
| 11 |
+
"down_block_types": [
|
| 12 |
+
"DownEncoderBlock2D",
|
| 13 |
+
"DownEncoderBlock2D",
|
| 14 |
+
"DownEncoderBlock2D",
|
| 15 |
+
"DownEncoderBlock2D"
|
| 16 |
+
],
|
| 17 |
+
"force_upcast": true,
|
| 18 |
+
"in_channels": 3,
|
| 19 |
+
"latent_channels": 4,
|
| 20 |
+
"latents_mean": null,
|
| 21 |
+
"latents_std": null,
|
| 22 |
+
"layers_per_block": 2,
|
| 23 |
+
"mid_block_add_attention": true,
|
| 24 |
+
"norm_num_groups": 32,
|
| 25 |
+
"out_channels": 3,
|
| 26 |
+
"sample_size": 512,
|
| 27 |
+
"scaling_factor": 0.18215,
|
| 28 |
+
"shift_factor": null,
|
| 29 |
+
"up_block_types": [
|
| 30 |
+
"UpDecoderBlock2D",
|
| 31 |
+
"UpDecoderBlock2D",
|
| 32 |
+
"UpDecoderBlock2D",
|
| 33 |
+
"UpDecoderBlock2D"
|
| 34 |
+
],
|
| 35 |
+
"use_post_quant_conv": true,
|
| 36 |
+
"use_quant_conv": true
|
| 37 |
+
}
|
checkpoints/Cataract-1K/vae_vid_diffusion/vae/diffusion_pytorch_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c092d4ddabf78277ce148cf7cbaf7f115119ef79b352a1a44d86f697f7052d51
|
| 3 |
+
size 334643268
|
checkpoints/Cataract-1K/vqgan_image/checkpoint.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8b02b93c343804b00d63ea69003490a27a7d3bb61825fbfa374c1a8ba14ff1ed
|
| 3 |
+
size 878907278
|
checkpoints/Cataract-1K/vqgan_image/config.yaml
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model:
|
| 2 |
+
base_learning_rate: 4.5e-06
|
| 3 |
+
target: surgsimbridge.taming.taming.models.vqgan.VQModel
|
| 4 |
+
params:
|
| 5 |
+
embed_dim: 1
|
| 6 |
+
n_embed: 16384
|
| 7 |
+
image_key: image
|
| 8 |
+
ddconfig:
|
| 9 |
+
double_z: false
|
| 10 |
+
z_channels: 1
|
| 11 |
+
resolution: 128
|
| 12 |
+
in_channels: 3
|
| 13 |
+
out_ch: 3
|
| 14 |
+
ch: 128
|
| 15 |
+
ch_mult:
|
| 16 |
+
- 1
|
| 17 |
+
- 2
|
| 18 |
+
- 2
|
| 19 |
+
- 4
|
| 20 |
+
num_res_blocks: 2
|
| 21 |
+
attn_resolutions:
|
| 22 |
+
- 16
|
| 23 |
+
dropout: 0.0
|
| 24 |
+
lossconfig:
|
| 25 |
+
target: surgsimbridge.taming.taming.modules.losses.vqperceptual.VQLPIPSWithDiscriminator
|
| 26 |
+
params:
|
| 27 |
+
disc_conditional: false
|
| 28 |
+
disc_in_channels: 3
|
| 29 |
+
disc_num_layers: 2
|
| 30 |
+
disc_start: 1
|
| 31 |
+
disc_weight: 0.6
|
| 32 |
+
codebook_weight: 1.0
|
| 33 |
+
data:
|
| 34 |
+
target: main.DataModuleFromConfig
|
| 35 |
+
params:
|
| 36 |
+
batch_size: 16
|
| 37 |
+
num_workers: 16
|
| 38 |
+
train:
|
| 39 |
+
target: surgsimbridge.taming.taming.data.surgicaldataset.CataractTrain
|
| 40 |
+
params:
|
| 41 |
+
size: 128
|
| 42 |
+
num_label: 14
|
| 43 |
+
augment: true
|
| 44 |
+
txt_file:
|
| 45 |
+
- /gris/scratch-gris-filesrv/sharvien/SurgSimBridge/Cataract-1K/splits/train.txt
|
| 46 |
+
data_root:
|
| 47 |
+
- /gris/scratch-gris-filesrv/sharvien/SurgSimBridge/Cataract-1K
|
| 48 |
+
validation:
|
| 49 |
+
target: surgsimbridge.taming.taming.data.surgicaldataset.CataractValidation
|
| 50 |
+
params:
|
| 51 |
+
size: 128
|
| 52 |
+
num_label: 14
|
| 53 |
+
augment: false
|
| 54 |
+
txt_file:
|
| 55 |
+
- /gris/scratch-gris-filesrv/sharvien/SurgSimBridge/Cataract-1K/splits/val.txt
|
| 56 |
+
data_root:
|
| 57 |
+
- /gris/scratch-gris-filesrv/sharvien/SurgSimBridge/Cataract-1K
|
checkpoints/Cataract-1K/vqgan_segmentation/checkpoint.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:df08b568767b66d718678837320f822184a98d3fd4d1194250cc43b34d77bd17
|
| 3 |
+
size 812256860
|
checkpoints/Cataract-1K/vqgan_segmentation/config.yaml
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model:
|
| 2 |
+
base_learning_rate: 4.5e-06
|
| 3 |
+
target: surgsimbridge.taming.taming.models.vqgan.VQSegmentationModel
|
| 4 |
+
params:
|
| 5 |
+
embed_dim: 1
|
| 6 |
+
n_embed: 8192
|
| 7 |
+
image_key: segmentation
|
| 8 |
+
ddconfig:
|
| 9 |
+
double_z: false
|
| 10 |
+
z_channels: 1
|
| 11 |
+
resolution: 128
|
| 12 |
+
in_channels: 14
|
| 13 |
+
out_ch: 14
|
| 14 |
+
ch: 128
|
| 15 |
+
ch_mult:
|
| 16 |
+
- 1
|
| 17 |
+
- 2
|
| 18 |
+
- 2
|
| 19 |
+
- 4
|
| 20 |
+
num_res_blocks: 2
|
| 21 |
+
attn_resolutions:
|
| 22 |
+
- 16
|
| 23 |
+
dropout: 0.0
|
| 24 |
+
lossconfig:
|
| 25 |
+
target: surgsimbridge.taming.taming.modules.losses.segmentation.BCELossWithQuant
|
| 26 |
+
params:
|
| 27 |
+
codebook_weight: 1.0
|
| 28 |
+
data:
|
| 29 |
+
target: main.DataModuleFromConfig
|
| 30 |
+
params:
|
| 31 |
+
batch_size: 16
|
| 32 |
+
num_workers: 16
|
| 33 |
+
train:
|
| 34 |
+
target: surgsimbridge.taming.taming.data.surgicaldataset.CataractTrain
|
| 35 |
+
params:
|
| 36 |
+
size: 128
|
| 37 |
+
num_label: 14
|
| 38 |
+
augment: true
|
| 39 |
+
txt_file:
|
| 40 |
+
- /gris/scratch-gris-filesrv/sharvien/SurgSimBridge/Cataract-1K/splits/train.txt
|
| 41 |
+
data_root:
|
| 42 |
+
- /gris/scratch-gris-filesrv/sharvien/SurgSimBridge/Cataract-1K
|
| 43 |
+
validation:
|
| 44 |
+
target: surgsimbridge.taming.taming.data.surgicaldataset.CataractValidation
|
| 45 |
+
params:
|
| 46 |
+
size: 128
|
| 47 |
+
num_label: 14
|
| 48 |
+
augment: false
|
| 49 |
+
txt_file:
|
| 50 |
+
- /gris/scratch-gris-filesrv/sharvien/SurgSimBridge/Cataract-1K/splits/val.txt
|
| 51 |
+
data_root:
|
| 52 |
+
- /gris/scratch-gris-filesrv/sharvien/SurgSimBridge/Cataract-1K
|