SearchingMan committed on
Commit
7b0cd98
·
verified ·
1 Parent(s): 2bcc764

Upload LiteRT Stable Diffusion v1.5 exports with Android/iOS deployment profiles

Browse files
README.md CHANGED
@@ -2,12 +2,21 @@
2
 
3
  This repository contains a LiteRT/TFLite export of the Hugging Face model `stable-diffusion-v1-5/stable-diffusion-v1-5`.
4
 
5
- ## Variants
6
 
7
- - `fp32/`: reference export
8
  - `int8/`: mixed bundle with fp32 text encoder fallback, PT2E dynamic int8 UNet, and fp32 VAE fallback
9
 
10
- ## Files per exported variant
 
 
 
 
 
 
 
 
 
11
 
12
  - `text_encoder.tflite`
13
  - `unet.tflite`
@@ -24,7 +33,8 @@ This repository contains a LiteRT/TFLite export of the Hugging Face model `stabl
24
  ## Notes
25
 
26
  - Stable Diffusion v1.5 is a multi-stage pipeline, so this export is split into submodels.
27
- - The notebook first tries to export the text encoder with INT32 token ids for better GPU delegate compatibility and records the actual exported input dtype per variant.
28
  - The fp32 bundle is optional debug output; on CPU runtimes it is skipped by default to avoid kernel deaths during fp32 UNet conversion.
29
- - Both exported variants are smoke-tested by reloading the serialized LiteRT models and executing inference.
 
30
  - The preview images in `preview/` are decoder smoke tests, not final text-to-image samples.
 
2
 
3
  This repository contains a LiteRT/TFLite export of the Hugging Face model `stable-diffusion-v1-5/stable-diffusion-v1-5`.
4
 
5
+ ## Base variants
6
 
7
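+ - `fp32/`: reference float export used by `android-gpu` and `ios-coreml`; its text encoder and VAE decoder are also referenced by the `android-qnn-npu` and `android-cpu` profiles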
8
  - `int8/`: mixed bundle with fp32 text encoder fallback, PT2E dynamic int8 UNet, and fp32 VAE fallback
9
 
10
+ ## Deployment profiles
11
+
12
+ - `android-qnn-npu`: LiteRT Qualcomm AI Engine Direct (QNN) (android, preferred accelerator=NPU)
13
+ - `android-gpu`: LiteRT GPU delegate (android, preferred accelerator=GPU)
14
+ - `android-cpu`: LiteRT CPU/XNNPACK (android, preferred accelerator=CPU)
15
+ - `ios-coreml`: LiteRT Core ML delegate (ios, preferred accelerator=CORE_ML)
16
+
17
+ Profiles are emitted in `conversion_manifest.json` as manifest-level mappings onto the exported base variants. This avoids duplicating large model binaries while still letting each runtime pick backend-specific artifacts.
18
+
19
+ ## Files per exported base variant
20
 
21
  - `text_encoder.tflite`
22
  - `unet.tflite`
 
33
  ## Notes
34
 
35
  - Stable Diffusion v1.5 is a multi-stage pipeline, so this export is split into submodels.
36
+ - The notebook first tries to export the text encoder with INT32 token ids for better GPU/Core ML delegate compatibility and records the actual exported input dtype per variant and per deployment profile.
37
  - The fp32 bundle is optional debug output; on CPU runtimes it is skipped by default to avoid kernel deaths during fp32 UNet conversion.
38
+ - `android-qnn-npu` is a LiteRT/QNN-oriented deployment profile, not a Qualcomm AOT context binary.
39
+ - Both exported base variants are smoke-tested by reloading the serialized LiteRT models and executing inference.
40
  - The preview images in `preview/` are decoder smoke tests, not final text-to-image samples.
configs/text_encoder_runtime_config.json CHANGED
@@ -36,6 +36,70 @@
36
  49407
37
  ],
38
  "gpu_delegate_friendly": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  }
40
  },
41
  "notes": "Tokenizer output is external to the exported model. Token IDs are vocabulary indices and are not int8-quantized."
 
36
  49407
37
  ],
38
  "gpu_delegate_friendly": true
39
+ },
40
+ "android-qnn-npu": {
41
+ "requested_token_dtype": "int32",
42
+ "exported_input_name": "serving_default_args_0",
43
+ "exported_input_dtype": "INT32",
44
+ "exported_input_shape": [
45
+ 1,
46
+ 77
47
+ ],
48
+ "token_range": [
49
+ 267,
50
+ 49407
51
+ ],
52
+ "gpu_delegate_friendly": true,
53
+ "source_variant": "fp32",
54
+ "profile_name": "android-qnn-npu"
55
+ },
56
+ "android-cpu": {
57
+ "requested_token_dtype": "int32",
58
+ "exported_input_name": "serving_default_args_0",
59
+ "exported_input_dtype": "INT32",
60
+ "exported_input_shape": [
61
+ 1,
62
+ 77
63
+ ],
64
+ "token_range": [
65
+ 267,
66
+ 49407
67
+ ],
68
+ "gpu_delegate_friendly": true,
69
+ "source_variant": "fp32",
70
+ "profile_name": "android-cpu"
71
+ },
72
+ "android-gpu": {
73
+ "requested_token_dtype": "int32",
74
+ "exported_input_name": "serving_default_args_0",
75
+ "exported_input_dtype": "INT32",
76
+ "exported_input_shape": [
77
+ 1,
78
+ 77
79
+ ],
80
+ "token_range": [
81
+ 267,
82
+ 49407
83
+ ],
84
+ "gpu_delegate_friendly": true,
85
+ "source_variant": "fp32",
86
+ "profile_name": "android-gpu"
87
+ },
88
+ "ios-coreml": {
89
+ "requested_token_dtype": "int32",
90
+ "exported_input_name": "serving_default_args_0",
91
+ "exported_input_dtype": "INT32",
92
+ "exported_input_shape": [
93
+ 1,
94
+ 77
95
+ ],
96
+ "token_range": [
97
+ 267,
98
+ 49407
99
+ ],
100
+ "gpu_delegate_friendly": true,
101
+ "source_variant": "fp32",
102
+ "profile_name": "ios-coreml"
103
  }
104
  },
105
  "notes": "Tokenizer output is external to the exported model. Token IDs are vocabulary indices and are not int8-quantized."
configs/unet_config.json CHANGED
@@ -63,37 +63,37 @@
63
  "cross_attention_norm": null,
64
  "addition_embed_type_num_heads": 64,
65
  "_use_default_values": [
66
- "time_embedding_type",
67
- "dual_cross_attention",
68
- "time_embedding_dim",
69
- "resnet_out_scale_factor",
70
- "use_linear_projection",
71
  "resnet_skip_time_act",
72
  "resnet_time_scale_shift",
73
- "addition_embed_type",
74
- "time_embedding_act_fn",
75
- "conv_in_kernel",
76
- "reverse_transformer_layers_per_block",
77
- "num_attention_heads",
78
- "timestep_post_act",
79
- "addition_embed_type_num_heads",
80
- "only_cross_attention",
81
- "attention_type",
82
- "time_cond_proj_dim",
83
- "num_class_embeds",
84
- "addition_time_embed_dim",
85
  "dropout",
86
  "encoder_hid_dim",
 
87
  "mid_block_type",
 
 
 
88
  "transformer_layers_per_block",
89
- "encoder_hid_dim_type",
90
- "conv_out_kernel",
91
- "cross_attention_norm",
92
- "class_embeddings_concat",
93
  "mid_block_only_cross_attention",
 
 
94
  "upcast_attention",
 
 
 
 
 
 
 
95
  "projection_class_embeddings_input_dim",
96
- "class_embed_type"
 
 
 
 
 
 
97
  ],
98
  "_class_name": "UNet2DConditionModel",
99
  "_diffusers_version": "0.6.0",
 
63
  "cross_attention_norm": null,
64
  "addition_embed_type_num_heads": 64,
65
  "_use_default_values": [
66
+ "encoder_hid_dim_type",
67
+ "class_embeddings_concat",
 
 
 
68
  "resnet_skip_time_act",
69
  "resnet_time_scale_shift",
 
 
 
 
 
 
 
 
 
 
 
 
70
  "dropout",
71
  "encoder_hid_dim",
72
+ "conv_out_kernel",
73
  "mid_block_type",
74
+ "time_embedding_type",
75
+ "num_class_embeds",
76
+ "addition_embed_type",
77
  "transformer_layers_per_block",
 
 
 
 
78
  "mid_block_only_cross_attention",
79
+ "use_linear_projection",
80
+ "time_embedding_act_fn",
81
  "upcast_attention",
82
+ "time_embedding_dim",
83
+ "time_cond_proj_dim",
84
+ "class_embed_type",
85
+ "reverse_transformer_layers_per_block",
86
+ "only_cross_attention",
87
+ "attention_type",
88
+ "dual_cross_attention",
89
  "projection_class_embeddings_input_dim",
90
+ "num_attention_heads",
91
+ "addition_embed_type_num_heads",
92
+ "addition_time_embed_dim",
93
+ "timestep_post_act",
94
+ "conv_in_kernel",
95
+ "cross_attention_norm",
96
+ "resnet_out_scale_factor"
97
  ],
98
  "_class_name": "UNet2DConditionModel",
99
  "_diffusers_version": "0.6.0",
configs/vae_config.json CHANGED
@@ -34,12 +34,12 @@
34
  "mid_block_add_attention": true,
35
  "_use_default_values": [
36
  "use_quant_conv",
37
- "shift_factor",
 
38
  "mid_block_add_attention",
39
  "scaling_factor",
40
  "force_upcast",
41
- "latents_std",
42
- "use_post_quant_conv",
43
  "latents_mean"
44
  ],
45
  "_class_name": "AutoencoderKL",
 
34
  "mid_block_add_attention": true,
35
  "_use_default_values": [
36
  "use_quant_conv",
37
+ "latents_std",
38
+ "use_post_quant_conv",
39
  "mid_block_add_attention",
40
  "scaling_factor",
41
  "force_upcast",
42
+ "shift_factor",
 
43
  "latents_mean"
44
  ],
45
  "_class_name": "AutoencoderKL",
conversion_manifest.json CHANGED
@@ -11,6 +11,80 @@
11
  "fp32",
12
  "int8"
13
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  "preferred_text_encoder_token_dtype": "int32",
15
  "text_encoder_runtime_config": "configs/text_encoder_runtime_config.json"
16
  }
 
11
  "fp32",
12
  "int8"
13
  ],
14
+ "profiles": {
15
+ "android-qnn-npu": {
16
+ "platform": "android",
17
+ "preferred_accelerator": "NPU",
18
+ "delegate": "LiteRT Qualcomm AI Engine Direct (QNN)",
19
+ "notes": [
20
+ "Mixed deployment profile for the Qualcomm NPU path through LiteRT CompiledModel.",
21
+ "This notebook still exports LiteRT/TFLite submodels, not Qualcomm-specific AOT context binaries.",
22
+ "Android packaging still needs Qualcomm LiteRT runtime libraries and arm64-v8a delivery."
23
+ ],
24
+ "source_variant": "int8",
25
+ "files": {
26
+ "text_encoder": "fp32/text_encoder.tflite",
27
+ "unet": "int8/unet.tflite",
28
+ "vae_decoder": "fp32/vae_decoder.tflite"
29
+ },
30
+ "quantization": "fp32 text encoder + dynamic int8 UNet + fp32 VAE"
31
+ },
32
+ "android-cpu": {
33
+ "platform": "android",
34
+ "preferred_accelerator": "CPU",
35
+ "delegate": "LiteRT CPU/XNNPACK",
36
+ "notes": [
37
+ "Conservative fallback profile for Android when GPU/NPU compilation is unavailable.",
38
+ "Reuses the mixed int8 UNet path for smaller downloads and lower RAM pressure."
39
+ ],
40
+ "source_variant": "int8",
41
+ "files": {
42
+ "text_encoder": "fp32/text_encoder.tflite",
43
+ "unet": "int8/unet.tflite",
44
+ "vae_decoder": "fp32/vae_decoder.tflite"
45
+ },
46
+ "quantization": "fp32 text encoder + dynamic int8 UNet + fp32 VAE"
47
+ },
48
+ "android-gpu": {
49
+ "platform": "android",
50
+ "preferred_accelerator": "GPU",
51
+ "delegate": "LiteRT GPU delegate",
52
+ "notes": [
53
+ "Uses the float export path because LiteRT GPU delegates are the most predictable there.",
54
+ "The text encoder still prefers INT32 token ids to avoid delegate-hostile INT64 input graphs."
55
+ ],
56
+ "source_variant": "fp32",
57
+ "files": {
58
+ "text_encoder": "fp32/text_encoder.tflite",
59
+ "unet": "fp32/unet.tflite",
60
+ "vae_decoder": "fp32/vae_decoder.tflite"
61
+ },
62
+ "quantization": "fp32"
63
+ },
64
+ "ios-coreml": {
65
+ "platform": "ios",
66
+ "preferred_accelerator": "CORE_ML",
67
+ "delegate": "LiteRT Core ML delegate",
68
+ "notes": [
69
+ "Core ML delegate currently supports float models, so this profile stays on the float export path.",
70
+ "This notebook exports LiteRT/TFLite artifacts for the LiteRT Core ML delegate, not native `.mlmodel` files."
71
+ ],
72
+ "source_variant": "fp32",
73
+ "files": {
74
+ "text_encoder": "fp32/text_encoder.tflite",
75
+ "unet": "fp32/unet.tflite",
76
+ "vae_decoder": "fp32/vae_decoder.tflite"
77
+ },
78
+ "quantization": "fp32",
79
+ "minimum_os": "iOS 12"
80
+ }
81
+ },
82
+ "android_profile_priority": {
83
+ "GPU": "android-gpu",
84
+ "NPU": "android-qnn-npu",
85
+ "CPU": "android-cpu"
86
+ },
87
+ "legacy_default_variant": "int8",
88
  "preferred_text_encoder_token_dtype": "int32",
89
  "text_encoder_runtime_config": "configs/text_encoder_runtime_config.json"
90
  }
fp32/manifest.json CHANGED
@@ -5,6 +5,10 @@
5
  "unet": "unet.tflite",
6
  "vae_decoder": "vae_decoder.tflite"
7
  },
 
 
 
 
8
  "text_encoder_export": {
9
  "selected": {
10
  "requested_token_dtype": "int32",
 
5
  "unet": "unet.tflite",
6
  "vae_decoder": "vae_decoder.tflite"
7
  },
8
+ "deployment_profiles": [
9
+ "android-gpu",
10
+ "ios-coreml"
11
+ ],
12
  "text_encoder_export": {
13
  "selected": {
14
  "requested_token_dtype": "int32",
fp32/unet.tflite CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5587bbd662f3955ba95cfce59c04dbcae44f973680c1a2cc230de93b6ace0eaf
3
  size 3439837600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afd9fe4f2203ae292b756154ffc8d0f42aac64fa0a92c6faff8d61c7ee6b3ef8
3
  size 3439837600
int8/manifest.json CHANGED
@@ -6,6 +6,10 @@
6
  "unet": "unet.tflite",
7
  "vae_decoder": "vae_decoder.tflite"
8
  },
 
 
 
 
9
  "text_encoder_export": {
10
  "selected": {
11
  "requested_token_dtype": "int32",
 
6
  "unet": "unet.tflite",
7
  "vae_decoder": "vae_decoder.tflite"
8
  },
9
+ "deployment_profiles": [
10
+ "android-qnn-npu",
11
+ "android-cpu"
12
+ ],
13
  "text_encoder_export": {
14
  "selected": {
15
  "requested_token_dtype": "int32",