Upload LiteRT Stable Diffusion v1.5 exports with Android/iOS deployment profiles

Browse files

Files changed (8) hide show

README.md +15 -5
configs/text_encoder_runtime_config.json +64 -0
configs/unet_config.json +22 -22
configs/vae_config.json +3 -3
conversion_manifest.json +74 -0
fp32/manifest.json +4 -0
fp32/unet.tflite +1 -1
int8/manifest.json +4 -0

README.md CHANGED Viewed

@@ -2,12 +2,21 @@
 This repository contains a LiteRT/TFLite export of the Hugging Face model `stable-diffusion-v1-5/stable-diffusion-v1-5`.
-## Variants
-- `fp32/`: reference export
 - `int8/`: mixed bundle with fp32 text encoder fallback, PT2E dynamic int8 UNet, and fp32 VAE fallback
-## Files per exported variant
 - `text_encoder.tflite`
 - `unet.tflite`
@@ -24,7 +33,8 @@ This repository contains a LiteRT/TFLite export of the Hugging Face model `stabl
 ## Notes
 - Stable Diffusion v1.5 is a multi-stage pipeline, so this export is split into submodels.
-- The notebook first tries to export the text encoder with INT32 token ids for better GPU delegate compatibility and records the actual exported input dtype per variant.
 - The fp32 bundle is optional debug output; on CPU runtimes it is skipped by default to avoid kernel deaths during fp32 UNet conversion.
-- Both exported variants are smoke-tested by reloading the serialized LiteRT models and executing inference.
 - The preview images in `preview/` are decoder smoke tests, not final text-to-image samples.

 This repository contains a LiteRT/TFLite export of the Hugging Face model `stable-diffusion-v1-5/stable-diffusion-v1-5`.
+## Base variants
+- `fp32/`: reference float export used by `android-gpu` and `ios-coreml`
 - `int8/`: mixed bundle with fp32 text encoder fallback, PT2E dynamic int8 UNet, and fp32 VAE fallback; used by `android-qnn-npu` and `android-cpu`
+## Deployment profiles
+- `android-qnn-npu`: LiteRT Qualcomm AI Engine Direct (QNN) (android, preferred accelerator=NPU)
+- `android-gpu`: LiteRT GPU delegate (android, preferred accelerator=GPU)
+- `android-cpu`: LiteRT CPU/XNNPACK (android, preferred accelerator=CPU)
+- `ios-coreml`: LiteRT Core ML delegate (ios, preferred accelerator=CORE_ML)
+Profiles are emitted in `conversion_manifest.json` as manifest-level mappings onto the exported base variants. This avoids duplicating large model binaries while still letting each runtime pick backend-specific artifacts.
+## Files per exported base variant
 - `text_encoder.tflite`
 - `unet.tflite`
 ## Notes
 - Stable Diffusion v1.5 is a multi-stage pipeline, so this export is split into submodels.
+- The notebook first tries to export the text encoder with INT32 token ids for better GPU/Core ML delegate compatibility and records the actual exported input dtype per variant and per deployment profile.
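 - The fp32 bundle is optional debug output; on CPU runtimes it is skipped by default to avoid notebook kernel crashes during fp32 UNet conversion. Note that all four deployment profiles in `conversion_manifest.json` reference files under `fp32/`, so the bundle must be present for those profile paths to resolve.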
+- `android-qnn-npu` is a LiteRT/QNN-oriented deployment profile, not a Qualcomm AOT context binary.
+- Both exported base variants are smoke-tested by reloading the serialized LiteRT models and executing inference.
 - The preview images in `preview/` are decoder smoke tests, not final text-to-image samples.

configs/text_encoder_runtime_config.json CHANGED Viewed

@@ -36,6 +36,70 @@
         49407
       ],
       "gpu_delegate_friendly": true
     }
   },
   "notes": "Tokenizer output is external to the exported model. Token IDs are vocabulary indices and are not int8-quantized."

         49407
       ],
       "gpu_delegate_friendly": true
+    },
+    "android-qnn-npu": {
+      "requested_token_dtype": "int32",
+      "exported_input_name": "serving_default_args_0",
+      "exported_input_dtype": "INT32",
+      "exported_input_shape": [
+        1,
+        77
+      ],
+      "token_range": [
+        267,
+        49407
+      ],
+      "gpu_delegate_friendly": true,
+      "source_variant": "fp32",
+      "profile_name": "android-qnn-npu"
+    },
+    "android-cpu": {
+      "requested_token_dtype": "int32",
+      "exported_input_name": "serving_default_args_0",
+      "exported_input_dtype": "INT32",
+      "exported_input_shape": [
+        1,
+        77
+      ],
+      "token_range": [
+        267,
+        49407
+      ],
+      "gpu_delegate_friendly": true,
+      "source_variant": "fp32",
+      "profile_name": "android-cpu"
+    },
+    "android-gpu": {
+      "requested_token_dtype": "int32",
+      "exported_input_name": "serving_default_args_0",
+      "exported_input_dtype": "INT32",
+      "exported_input_shape": [
+        1,
+        77
+      ],
+      "token_range": [
+        267,
+        49407
+      ],
+      "gpu_delegate_friendly": true,
+      "source_variant": "fp32",
+      "profile_name": "android-gpu"
+    },
+    "ios-coreml": {
+      "requested_token_dtype": "int32",
+      "exported_input_name": "serving_default_args_0",
+      "exported_input_dtype": "INT32",
+      "exported_input_shape": [
+        1,
+        77
+      ],
+      "token_range": [
+        267,
+        49407
+      ],
+      "gpu_delegate_friendly": true,
+      "source_variant": "fp32",
+      "profile_name": "ios-coreml"
     }
   },
   "notes": "Tokenizer output is external to the exported model. Token IDs are vocabulary indices and are not int8-quantized."

configs/unet_config.json CHANGED Viewed

@@ -63,37 +63,37 @@
   "cross_attention_norm": null,
   "addition_embed_type_num_heads": 64,
   "_use_default_values": [
-    "time_embedding_type",
-    "dual_cross_attention",
-    "time_embedding_dim",
-    "resnet_out_scale_factor",
-    "use_linear_projection",
     "resnet_skip_time_act",
     "resnet_time_scale_shift",
-    "addition_embed_type",
-    "time_embedding_act_fn",
-    "conv_in_kernel",
-    "reverse_transformer_layers_per_block",
-    "num_attention_heads",
-    "timestep_post_act",
-    "addition_embed_type_num_heads",
-    "only_cross_attention",
-    "attention_type",
-    "time_cond_proj_dim",
-    "num_class_embeds",
-    "addition_time_embed_dim",
     "dropout",
     "encoder_hid_dim",
     "mid_block_type",
     "transformer_layers_per_block",
-    "encoder_hid_dim_type",
-    "conv_out_kernel",
-    "cross_attention_norm",
-    "class_embeddings_concat",
     "mid_block_only_cross_attention",
     "upcast_attention",
     "projection_class_embeddings_input_dim",
-    "class_embed_type"
   ],
   "_class_name": "UNet2DConditionModel",
   "_diffusers_version": "0.6.0",

   "cross_attention_norm": null,
   "addition_embed_type_num_heads": 64,
   "_use_default_values": [
+    "encoder_hid_dim_type",
+    "class_embeddings_concat",
     "resnet_skip_time_act",
     "resnet_time_scale_shift",
     "dropout",
     "encoder_hid_dim",
+    "conv_out_kernel",
     "mid_block_type",
+    "time_embedding_type",
+    "num_class_embeds",
+    "addition_embed_type",
     "transformer_layers_per_block",
     "mid_block_only_cross_attention",
+    "use_linear_projection",
+    "time_embedding_act_fn",
     "upcast_attention",
+    "time_embedding_dim",
+    "time_cond_proj_dim",
+    "class_embed_type",
+    "reverse_transformer_layers_per_block",
+    "only_cross_attention",
+    "attention_type",
+    "dual_cross_attention",
     "projection_class_embeddings_input_dim",
+    "num_attention_heads",
+    "addition_embed_type_num_heads",
+    "addition_time_embed_dim",
+    "timestep_post_act",
+    "conv_in_kernel",
+    "cross_attention_norm",
+    "resnet_out_scale_factor"
   ],
   "_class_name": "UNet2DConditionModel",
   "_diffusers_version": "0.6.0",

configs/vae_config.json CHANGED Viewed

@@ -34,12 +34,12 @@
   "mid_block_add_attention": true,
   "_use_default_values": [
     "use_quant_conv",
-    "shift_factor",
     "mid_block_add_attention",
     "scaling_factor",
     "force_upcast",
-    "latents_std",
-    "use_post_quant_conv",
     "latents_mean"
   ],
   "_class_name": "AutoencoderKL",

   "mid_block_add_attention": true,
   "_use_default_values": [
     "use_quant_conv",
+    "latents_std",
+    "use_post_quant_conv",
     "mid_block_add_attention",
     "scaling_factor",
     "force_upcast",
+    "shift_factor",
     "latents_mean"
   ],
   "_class_name": "AutoencoderKL",

conversion_manifest.json CHANGED Viewed

@@ -11,6 +11,80 @@
     "fp32",
     "int8"
   ],
   "preferred_text_encoder_token_dtype": "int32",
   "text_encoder_runtime_config": "configs/text_encoder_runtime_config.json"
 }

     "fp32",
     "int8"
   ],
+  "profiles": {
+    "android-qnn-npu": {
+      "platform": "android",
+      "preferred_accelerator": "NPU",
+      "delegate": "LiteRT Qualcomm AI Engine Direct (QNN)",
+      "notes": [
+        "Mixed deployment profile for the Qualcomm NPU path through LiteRT CompiledModel.",
+        "This notebook still exports LiteRT/TFLite submodels, not Qualcomm-specific AOT context binaries.",
+        "Android packaging still needs Qualcomm LiteRT runtime libraries and arm64-v8a delivery."
+      ],
+      "source_variant": "int8",
+      "files": {
+        "text_encoder": "fp32/text_encoder.tflite",
+        "unet": "int8/unet.tflite",
+        "vae_decoder": "fp32/vae_decoder.tflite"
+      },
+      "quantization": "fp32 text encoder + dynamic int8 UNet + fp32 VAE"
+    },
+    "android-cpu": {
+      "platform": "android",
+      "preferred_accelerator": "CPU",
+      "delegate": "LiteRT CPU/XNNPACK",
+      "notes": [
+        "Conservative fallback profile for Android when GPU/NPU compilation is unavailable.",
+        "Reuses the mixed int8 UNet path for smaller downloads and lower RAM pressure."
+      ],
+      "source_variant": "int8",
+      "files": {
+        "text_encoder": "fp32/text_encoder.tflite",
+        "unet": "int8/unet.tflite",
+        "vae_decoder": "fp32/vae_decoder.tflite"
+      },
+      "quantization": "fp32 text encoder + dynamic int8 UNet + fp32 VAE"
+    },
+    "android-gpu": {
+      "platform": "android",
+      "preferred_accelerator": "GPU",
+      "delegate": "LiteRT GPU delegate",
+      "notes": [
+        "Uses the float export path because LiteRT GPU delegates are the most predictable there.",
+        "The text encoder still prefers INT32 token ids to avoid delegate-hostile INT64 input graphs."
+      ],
+      "source_variant": "fp32",
+      "files": {
+        "text_encoder": "fp32/text_encoder.tflite",
+        "unet": "fp32/unet.tflite",
+        "vae_decoder": "fp32/vae_decoder.tflite"
+      },
+      "quantization": "fp32"
+    },
+    "ios-coreml": {
+      "platform": "ios",
+      "preferred_accelerator": "CORE_ML",
+      "delegate": "LiteRT Core ML delegate",
+      "notes": [
+        "The Core ML delegate currently supports only float models, so this profile stays on the float export path.",
+        "This notebook exports LiteRT/TFLite artifacts for the LiteRT Core ML delegate, not native `.mlmodel` files."
+      ],
+      "source_variant": "fp32",
+      "files": {
+        "text_encoder": "fp32/text_encoder.tflite",
+        "unet": "fp32/unet.tflite",
+        "vae_decoder": "fp32/vae_decoder.tflite"
+      },
+      "quantization": "fp32",
+      "minimum_os": "iOS 12"
+    }
+  },
+  "android_profile_priority": {
+    "GPU": "android-gpu",
+    "NPU": "android-qnn-npu",
+    "CPU": "android-cpu"
+  },
+  "legacy_default_variant": "int8",
   "preferred_text_encoder_token_dtype": "int32",
   "text_encoder_runtime_config": "configs/text_encoder_runtime_config.json"
 }

fp32/manifest.json CHANGED Viewed

@@ -5,6 +5,10 @@
     "unet": "unet.tflite",
     "vae_decoder": "vae_decoder.tflite"
   },
   "text_encoder_export": {
     "selected": {
       "requested_token_dtype": "int32",

     "unet": "unet.tflite",
     "vae_decoder": "vae_decoder.tflite"
   },
+  "deployment_profiles": [
+    "android-gpu",
+    "ios-coreml"
+  ],
   "text_encoder_export": {
     "selected": {
       "requested_token_dtype": "int32",

fp32/unet.tflite CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5587bbd662f3955ba95cfce59c04dbcae44f973680c1a2cc230de93b6ace0eaf
 size 3439837600

 version https://git-lfs.github.com/spec/v1
+oid sha256:afd9fe4f2203ae292b756154ffc8d0f42aac64fa0a92c6faff8d61c7ee6b3ef8
 size 3439837600

int8/manifest.json CHANGED Viewed

@@ -6,6 +6,10 @@
     "unet": "unet.tflite",
     "vae_decoder": "vae_decoder.tflite"
   },
   "text_encoder_export": {
     "selected": {
       "requested_token_dtype": "int32",

     "unet": "unet.tflite",
     "vae_decoder": "vae_decoder.tflite"
   },
+  "deployment_profiles": [
+    "android-qnn-npu",
+    "android-cpu"
+  ],
   "text_encoder_export": {
     "selected": {
       "requested_token_dtype": "int32",