SearchingMan committed on
Commit
2bcc764
·
verified ·
1 Parent(s): 7a81490

Upload LiteRT fp32 and int8 exports of Stable Diffusion v1.5

Browse files
README.md CHANGED
@@ -7,7 +7,7 @@ This repository contains a LiteRT/TFLite export of the Hugging Face model `stabl
7
  - `fp32/`: reference export
8
  - `int8/`: mixed bundle with fp32 text encoder fallback, PT2E dynamic int8 UNet, and fp32 VAE fallback
9
 
10
- ## Files per variant
11
 
12
  - `text_encoder.tflite`
13
  - `unet.tflite`
@@ -18,10 +18,13 @@ This repository contains a LiteRT/TFLite export of the Hugging Face model `stabl
18
  - `tokenizer/`
19
  - `scheduler/`
20
  - `configs/`
 
21
  - `conversion_manifest.json`
22
 
23
  ## Notes
24
 
25
  - Stable Diffusion v1.5 is a multi-stage pipeline, so this export is split into submodels.
26
- - Both the `fp32` and `int8` variants are smoke-tested by reloading the serialized LiteRT models and executing inference.
 
 
27
  - The preview images in `preview/` are decoder smoke tests, not final text-to-image samples.
 
7
  - `fp32/`: reference export
8
  - `int8/`: mixed bundle with fp32 text encoder fallback, PT2E dynamic int8 UNet, and fp32 VAE fallback
9
 
10
+ ## Files per exported variant
11
 
12
  - `text_encoder.tflite`
13
  - `unet.tflite`
 
18
  - `tokenizer/`
19
  - `scheduler/`
20
  - `configs/`
21
+ - `configs/text_encoder_runtime_config.json`
22
  - `conversion_manifest.json`
23
 
24
  ## Notes
25
 
26
  - Stable Diffusion v1.5 is a multi-stage pipeline, so this export is split into submodels.
27
+ - The notebook first tries to export the text encoder with INT32 token IDs for better GPU delegate compatibility, falls back to INT64 if that attempt fails, and records the actual exported input dtype per variant.
28
+ - The fp32 bundle is optional debug output; on CPU runtimes it is skipped by default to avoid notebook kernel crashes during fp32 UNet conversion.
29
+ - Each exported variant is smoke-tested by reloading its serialized LiteRT models and executing inference.
30
  - The preview images in `preview/` are decoder smoke tests, not final text-to-image samples.
configs/text_encoder_runtime_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tokenizer_dir": "tokenizer",
3
+ "tokenizer_max_length": 77,
4
+ "vocab_size": 49408,
5
+ "preferred_token_dtype": "int32",
6
+ "fallback_token_dtype": "int64",
7
+ "dtype_attempt_order": [
8
+ "int32",
9
+ "int64"
10
+ ],
11
+ "variants": {
12
+ "fp32": {
13
+ "requested_token_dtype": "int32",
14
+ "exported_input_name": "serving_default_args_0",
15
+ "exported_input_dtype": "INT32",
16
+ "exported_input_shape": [
17
+ 1,
18
+ 77
19
+ ],
20
+ "token_range": [
21
+ 267,
22
+ 49407
23
+ ],
24
+ "gpu_delegate_friendly": true
25
+ },
26
+ "int8": {
27
+ "requested_token_dtype": "int32",
28
+ "exported_input_name": "serving_default_args_0",
29
+ "exported_input_dtype": "INT32",
30
+ "exported_input_shape": [
31
+ 1,
32
+ 77
33
+ ],
34
+ "token_range": [
35
+ 267,
36
+ 49407
37
+ ],
38
+ "gpu_delegate_friendly": true
39
+ }
40
+ },
41
+ "notes": "Tokenizer output is external to the exported model. Token IDs are vocabulary indices and are not int8-quantized."
42
+ }
configs/unet_config.json CHANGED
@@ -63,37 +63,37 @@
63
  "cross_attention_norm": null,
64
  "addition_embed_type_num_heads": 64,
65
  "_use_default_values": [
66
- "num_attention_heads",
67
- "cross_attention_norm",
68
- "class_embeddings_concat",
69
  "resnet_out_scale_factor",
 
 
70
  "resnet_time_scale_shift",
 
 
71
  "conv_in_kernel",
72
- "dual_cross_attention",
73
- "num_class_embeds",
74
- "mid_block_only_cross_attention",
75
  "reverse_transformer_layers_per_block",
76
- "time_embedding_act_fn",
77
- "upcast_attention",
78
  "addition_embed_type_num_heads",
79
- "projection_class_embeddings_input_dim",
80
  "only_cross_attention",
81
- "conv_out_kernel",
82
- "mid_block_type",
83
- "dropout",
84
- "encoder_hid_dim_type",
85
- "addition_embed_type",
86
  "addition_time_embed_dim",
87
- "time_embedding_dim",
88
- "resnet_skip_time_act",
89
  "encoder_hid_dim",
90
- "use_linear_projection",
91
- "attention_type",
92
  "transformer_layers_per_block",
93
- "timestep_post_act",
94
- "time_embedding_type",
95
- "class_embed_type",
96
- "time_cond_proj_dim"
 
 
 
 
97
  ],
98
  "_class_name": "UNet2DConditionModel",
99
  "_diffusers_version": "0.6.0",
 
63
  "cross_attention_norm": null,
64
  "addition_embed_type_num_heads": 64,
65
  "_use_default_values": [
66
+ "time_embedding_type",
67
+ "dual_cross_attention",
68
+ "time_embedding_dim",
69
  "resnet_out_scale_factor",
70
+ "use_linear_projection",
71
+ "resnet_skip_time_act",
72
  "resnet_time_scale_shift",
73
+ "addition_embed_type",
74
+ "time_embedding_act_fn",
75
  "conv_in_kernel",
 
 
 
76
  "reverse_transformer_layers_per_block",
77
+ "num_attention_heads",
78
+ "timestep_post_act",
79
  "addition_embed_type_num_heads",
 
80
  "only_cross_attention",
81
+ "attention_type",
82
+ "time_cond_proj_dim",
83
+ "num_class_embeds",
 
 
84
  "addition_time_embed_dim",
85
+ "dropout",
 
86
  "encoder_hid_dim",
87
+ "mid_block_type",
 
88
  "transformer_layers_per_block",
89
+ "encoder_hid_dim_type",
90
+ "conv_out_kernel",
91
+ "cross_attention_norm",
92
+ "class_embeddings_concat",
93
+ "mid_block_only_cross_attention",
94
+ "upcast_attention",
95
+ "projection_class_embeddings_input_dim",
96
+ "class_embed_type"
97
  ],
98
  "_class_name": "UNet2DConditionModel",
99
  "_diffusers_version": "0.6.0",
configs/vae_config.json CHANGED
@@ -33,14 +33,14 @@
33
  "use_post_quant_conv": true,
34
  "mid_block_add_attention": true,
35
  "_use_default_values": [
36
- "mid_block_add_attention",
37
- "latents_mean",
38
  "use_quant_conv",
39
- "force_upcast",
40
- "scaling_factor",
41
  "shift_factor",
 
 
 
 
42
  "use_post_quant_conv",
43
- "latents_std"
44
  ],
45
  "_class_name": "AutoencoderKL",
46
  "_diffusers_version": "0.6.0",
 
33
  "use_post_quant_conv": true,
34
  "mid_block_add_attention": true,
35
  "_use_default_values": [
 
 
36
  "use_quant_conv",
 
 
37
  "shift_factor",
38
+ "mid_block_add_attention",
39
+ "scaling_factor",
40
+ "force_upcast",
41
+ "latents_std",
42
  "use_post_quant_conv",
43
+ "latents_mean"
44
  ],
45
  "_class_name": "AutoencoderKL",
46
  "_diffusers_version": "0.6.0",
conversion_manifest.json CHANGED
@@ -10,5 +10,7 @@
10
  "variants": [
11
  "fp32",
12
  "int8"
13
- ]
 
 
14
  }
 
10
  "variants": [
11
  "fp32",
12
  "int8"
13
+ ],
14
+ "preferred_text_encoder_token_dtype": "int32",
15
+ "text_encoder_runtime_config": "configs/text_encoder_runtime_config.json"
16
  }
fp32/manifest.json CHANGED
@@ -5,6 +5,38 @@
5
  "unet": "unet.tflite",
6
  "vae_decoder": "vae_decoder.tflite"
7
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  "validation": {
9
  "text_encoder": [
10
  {
 
5
  "unet": "unet.tflite",
6
  "vae_decoder": "vae_decoder.tflite"
7
  },
8
+ "text_encoder_export": {
9
+ "selected": {
10
+ "requested_token_dtype": "int32",
11
+ "exported_input_name": "serving_default_args_0",
12
+ "exported_input_dtype": "INT32",
13
+ "exported_input_shape": [
14
+ 1,
15
+ 77
16
+ ],
17
+ "token_range": [
18
+ 267,
19
+ 49407
20
+ ],
21
+ "gpu_delegate_friendly": true
22
+ },
23
+ "attempts": [
24
+ {
25
+ "requested_token_dtype": "int32",
26
+ "exported_input_name": "serving_default_args_0",
27
+ "exported_input_dtype": "INT32",
28
+ "exported_input_shape": [
29
+ 1,
30
+ 77
31
+ ],
32
+ "token_range": [
33
+ 267,
34
+ 49407
35
+ ],
36
+ "gpu_delegate_friendly": true
37
+ }
38
+ ]
39
+ },
40
  "validation": {
41
  "text_encoder": [
42
  {
fp32/text_encoder.tflite CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d54210be973fadd9786d8291d752ad218cbc27816b1b3e10fe458bd421523af6
3
- size 492646116
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:850e180ea0f2851967108b9c4e9214a78c4540d4c0d7400090a0d19d7eb765c4
3
+ size 492644936
fp32/unet.tflite CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:867eda6b8356dce0c5397aca9e3304ad461d3a2dcfdda643e5b8ade249761bc1
3
  size 3439837600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5587bbd662f3955ba95cfce59c04dbcae44f973680c1a2cc230de93b6ace0eaf
3
  size 3439837600
int8/manifest.json CHANGED
@@ -6,6 +6,38 @@
6
  "unet": "unet.tflite",
7
  "vae_decoder": "vae_decoder.tflite"
8
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  "validation": {
10
  "text_encoder": [
11
  {
 
6
  "unet": "unet.tflite",
7
  "vae_decoder": "vae_decoder.tflite"
8
  },
9
+ "text_encoder_export": {
10
+ "selected": {
11
+ "requested_token_dtype": "int32",
12
+ "exported_input_name": "serving_default_args_0",
13
+ "exported_input_dtype": "INT32",
14
+ "exported_input_shape": [
15
+ 1,
16
+ 77
17
+ ],
18
+ "token_range": [
19
+ 267,
20
+ 49407
21
+ ],
22
+ "gpu_delegate_friendly": true
23
+ },
24
+ "attempts": [
25
+ {
26
+ "requested_token_dtype": "int32",
27
+ "exported_input_name": "serving_default_args_0",
28
+ "exported_input_dtype": "INT32",
29
+ "exported_input_shape": [
30
+ 1,
31
+ 77
32
+ ],
33
+ "token_range": [
34
+ 267,
35
+ 49407
36
+ ],
37
+ "gpu_delegate_friendly": true
38
+ }
39
+ ]
40
+ },
41
  "validation": {
42
  "text_encoder": [
43
  {
int8/text_encoder.tflite CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d54210be973fadd9786d8291d752ad218cbc27816b1b3e10fe458bd421523af6
3
- size 492646116
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:850e180ea0f2851967108b9c4e9214a78c4540d4c0d7400090a0d19d7eb765c4
3
+ size 492644936