Upload LiteRT fp32 and int8 exports of Stable Diffusion v1.5

Browse files

Files changed (10) hide show

README.md +5 -2
configs/text_encoder_runtime_config.json +42 -0
configs/unet_config.json +22 -22
configs/vae_config.json +5 -5
conversion_manifest.json +3 -1
fp32/manifest.json +32 -0
fp32/text_encoder.tflite +2 -2
fp32/unet.tflite +1 -1
int8/manifest.json +32 -0
int8/text_encoder.tflite +2 -2

README.md CHANGED Viewed

@@ -7,7 +7,7 @@ This repository contains a LiteRT/TFLite export of the Hugging Face model `stabl
 - `fp32/`: reference export
 - `int8/`: mixed bundle with fp32 text encoder fallback, PT2E dynamic int8 UNet, and fp32 VAE fallback
-## Files per variant
 - `text_encoder.tflite`
 - `unet.tflite`
@@ -18,10 +18,13 @@ This repository contains a LiteRT/TFLite export of the Hugging Face model `stabl
 - `tokenizer/`
 - `scheduler/`
 - `configs/`
 - `conversion_manifest.json`
 ## Notes
 - Stable Diffusion v1.5 is a multi-stage pipeline, so this export is split into submodels.
-- Both the `fp32` and `int8` variants are smoke-tested by reloading the serialized LiteRT models and executing inference.
 - The preview images in `preview/` are decoder smoke tests, not final text-to-image samples.

 - `fp32/`: reference export
 - `int8/`: mixed bundle with fp32 text encoder fallback, PT2E dynamic int8 UNet, and fp32 VAE fallback
+## Files per exported variant
 - `text_encoder.tflite`
 - `unet.tflite`
 - `tokenizer/`
 - `scheduler/`
 - `configs/`
+- `configs/text_encoder_runtime_config.json`
 - `conversion_manifest.json`
 ## Notes
 - Stable Diffusion v1.5 is a multi-stage pipeline, so this export is split into submodels.
+- The notebook first tries to export the text encoder with INT32 token IDs for better GPU delegate compatibility, falls back to INT64 if that attempt fails, and records the actual exported input dtype per variant.
+- The fp32 bundle is optional debug output; on CPU runtimes it is skipped by default to avoid crashing the notebook kernel during fp32 UNet conversion.
+- Every exported variant is smoke-tested by reloading its serialized LiteRT models and executing inference.
 - The preview images in `preview/` are decoder smoke tests, not final text-to-image samples.

configs/text_encoder_runtime_config.json ADDED Viewed

	@@ -0,0 +1,42 @@

+{
+  "tokenizer_dir": "tokenizer",
+  "tokenizer_max_length": 77,
+  "vocab_size": 49408,
+  "preferred_token_dtype": "int32",
+  "fallback_token_dtype": "int64",
+  "dtype_attempt_order": [
+    "int32",
+    "int64"
+  ],
+  "variants": {
+    "fp32": {
+      "requested_token_dtype": "int32",
+      "exported_input_name": "serving_default_args_0",
+      "exported_input_dtype": "INT32",
+      "exported_input_shape": [
+        1,
+        77
+      ],
+      "token_range": [
+        267,
+        49407
+      ],
+      "gpu_delegate_friendly": true
+    },
+    "int8": {
+      "requested_token_dtype": "int32",
+      "exported_input_name": "serving_default_args_0",
+      "exported_input_dtype": "INT32",
+      "exported_input_shape": [
+        1,
+        77
+      ],
+      "token_range": [
+        267,
+        49407
+      ],
+      "gpu_delegate_friendly": true
+    }
+  },
+  "notes": "Tokenizer output is external to the exported model. Token IDs are vocabulary indices and are not int8-quantized."
+}

configs/unet_config.json CHANGED Viewed

@@ -63,37 +63,37 @@
   "cross_attention_norm": null,
   "addition_embed_type_num_heads": 64,
   "_use_default_values": [
-    "num_attention_heads",
-    "cross_attention_norm",
-    "class_embeddings_concat",
     "resnet_out_scale_factor",
     "resnet_time_scale_shift",
     "conv_in_kernel",
-    "dual_cross_attention",
-    "num_class_embeds",
-    "mid_block_only_cross_attention",
     "reverse_transformer_layers_per_block",
-    "time_embedding_act_fn",
-    "upcast_attention",
     "addition_embed_type_num_heads",
-    "projection_class_embeddings_input_dim",
     "only_cross_attention",
-    "conv_out_kernel",
-    "mid_block_type",
-    "dropout",
-    "encoder_hid_dim_type",
-    "addition_embed_type",
     "addition_time_embed_dim",
-    "time_embedding_dim",
-    "resnet_skip_time_act",
     "encoder_hid_dim",
-    "use_linear_projection",
-    "attention_type",
     "transformer_layers_per_block",
-    "timestep_post_act",
-    "time_embedding_type",
-    "class_embed_type",
-    "time_cond_proj_dim"
   ],
   "_class_name": "UNet2DConditionModel",
   "_diffusers_version": "0.6.0",

   "cross_attention_norm": null,
   "addition_embed_type_num_heads": 64,
   "_use_default_values": [
+    "time_embedding_type",
+    "dual_cross_attention",
+    "time_embedding_dim",
     "resnet_out_scale_factor",
+    "use_linear_projection",
+    "resnet_skip_time_act",
     "resnet_time_scale_shift",
+    "addition_embed_type",
+    "time_embedding_act_fn",
     "conv_in_kernel",
     "reverse_transformer_layers_per_block",
+    "num_attention_heads",
+    "timestep_post_act",
     "addition_embed_type_num_heads",
     "only_cross_attention",
+    "attention_type",
+    "time_cond_proj_dim",
+    "num_class_embeds",
     "addition_time_embed_dim",
+    "dropout",
     "encoder_hid_dim",
+    "mid_block_type",
     "transformer_layers_per_block",
+    "encoder_hid_dim_type",
+    "conv_out_kernel",
+    "cross_attention_norm",
+    "class_embeddings_concat",
+    "mid_block_only_cross_attention",
+    "upcast_attention",
+    "projection_class_embeddings_input_dim",
+    "class_embed_type"
   ],
   "_class_name": "UNet2DConditionModel",
   "_diffusers_version": "0.6.0",

configs/vae_config.json CHANGED Viewed

@@ -33,14 +33,14 @@
   "use_post_quant_conv": true,
   "mid_block_add_attention": true,
   "_use_default_values": [
-    "mid_block_add_attention",
-    "latents_mean",
     "use_quant_conv",
-    "force_upcast",
-    "scaling_factor",
     "shift_factor",
     "use_post_quant_conv",
-    "latents_std"
   ],
   "_class_name": "AutoencoderKL",
   "_diffusers_version": "0.6.0",

   "use_post_quant_conv": true,
   "mid_block_add_attention": true,
   "_use_default_values": [
     "use_quant_conv",
     "shift_factor",
+    "mid_block_add_attention",
+    "scaling_factor",
+    "force_upcast",
+    "latents_std",
     "use_post_quant_conv",
+    "latents_mean"
   ],
   "_class_name": "AutoencoderKL",
   "_diffusers_version": "0.6.0",

conversion_manifest.json CHANGED Viewed

@@ -10,5 +10,7 @@
   "variants": [
     "fp32",
     "int8"
-  ]
 }

   "variants": [
     "fp32",
     "int8"
+  ],
+  "preferred_text_encoder_token_dtype": "int32",
+  "text_encoder_runtime_config": "configs/text_encoder_runtime_config.json"
 }

fp32/manifest.json CHANGED Viewed

@@ -5,6 +5,38 @@
     "unet": "unet.tflite",
     "vae_decoder": "vae_decoder.tflite"
   },
   "validation": {
     "text_encoder": [
       {

     "unet": "unet.tflite",
     "vae_decoder": "vae_decoder.tflite"
   },
+  "text_encoder_export": {
+    "selected": {
+      "requested_token_dtype": "int32",
+      "exported_input_name": "serving_default_args_0",
+      "exported_input_dtype": "INT32",
+      "exported_input_shape": [
+        1,
+        77
+      ],
+      "token_range": [
+        267,
+        49407
+      ],
+      "gpu_delegate_friendly": true
+    },
+    "attempts": [
+      {
+        "requested_token_dtype": "int32",
+        "exported_input_name": "serving_default_args_0",
+        "exported_input_dtype": "INT32",
+        "exported_input_shape": [
+          1,
+          77
+        ],
+        "token_range": [
+          267,
+          49407
+        ],
+        "gpu_delegate_friendly": true
+      }
+    ]
+  },
   "validation": {
     "text_encoder": [
       {

fp32/text_encoder.tflite CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d54210be973fadd9786d8291d752ad218cbc27816b1b3e10fe458bd421523af6
-size 492646116

 version https://git-lfs.github.com/spec/v1
+oid sha256:850e180ea0f2851967108b9c4e9214a78c4540d4c0d7400090a0d19d7eb765c4
+size 492644936

fp32/unet.tflite CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:867eda6b8356dce0c5397aca9e3304ad461d3a2dcfdda643e5b8ade249761bc1
 size 3439837600

 version https://git-lfs.github.com/spec/v1
+oid sha256:5587bbd662f3955ba95cfce59c04dbcae44f973680c1a2cc230de93b6ace0eaf
 size 3439837600

int8/manifest.json CHANGED Viewed

@@ -6,6 +6,38 @@
     "unet": "unet.tflite",
     "vae_decoder": "vae_decoder.tflite"
   },
   "validation": {
     "text_encoder": [
       {

     "unet": "unet.tflite",
     "vae_decoder": "vae_decoder.tflite"
   },
+  "text_encoder_export": {
+    "selected": {
+      "requested_token_dtype": "int32",
+      "exported_input_name": "serving_default_args_0",
+      "exported_input_dtype": "INT32",
+      "exported_input_shape": [
+        1,
+        77
+      ],
+      "token_range": [
+        267,
+        49407
+      ],
+      "gpu_delegate_friendly": true
+    },
+    "attempts": [
+      {
+        "requested_token_dtype": "int32",
+        "exported_input_name": "serving_default_args_0",
+        "exported_input_dtype": "INT32",
+        "exported_input_shape": [
+          1,
+          77
+        ],
+        "token_range": [
+          267,
+          49407
+        ],
+        "gpu_delegate_friendly": true
+      }
+    ]
+  },
   "validation": {
     "text_encoder": [
       {

int8/text_encoder.tflite CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d54210be973fadd9786d8291d752ad218cbc27816b1b3e10fe458bd421523af6
-size 492646116

 version https://git-lfs.github.com/spec/v1
+oid sha256:850e180ea0f2851967108b9c4e9214a78c4540d4c0d7400090a0d19d7eb765c4
+size 492644936