EulerHuaji zhangjiewu committed
Commit d13ed6c · verified · 0 Parent(s)

Duplicate from nvidia/difix_ref

Co-authored-by: Jay Wu <zhangjiewu@users.noreply.huggingface.co>
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
LICENSE.txt ADDED
@@ -0,0 +1,76 @@
+ NVIDIA License
+
+ 1. Definitions
+
+ “Licensor” means any person or entity that distributes its Work.
+ “Work” means (a) the original work of authorship made available under this license, which may include software, documentation, or other files, and (b) any additions to or derivative works thereof that are made available under this license.
+ The terms “reproduce,” “reproduction,” “derivative works,” and “distribution” have the meaning as provided under U.S. copyright law; provided, however, that for the purposes of this license, derivative works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work.
+ Works are “made available” under this license by including in or with the Work either (a) a copyright notice referencing the applicability of this license to the Work, or (b) a copy of this license.
+
+ 2. License Grant
+
+ 2.1 Copyright Grant. Subject to the terms and conditions of this license, each Licensor grants to you a perpetual, worldwide, non-exclusive, royalty-free, copyright license to use, reproduce, prepare derivative works of, publicly display, publicly perform, sublicense and distribute its Work and any resulting derivative works in any form.
+
+ 3. Limitations
+
+ 3.1 Redistribution. You may reproduce or distribute the Work only if (a) you do so under this license, (b) you include a complete copy of this license with your distribution, and (c) you retain without modification any copyright, patent, trademark, or attribution notices that are present in the Work.
+
+ 3.2 Derivative Works. You may specify that additional or different terms apply to the use, reproduction, and distribution of your derivative works of the Work (“Your Terms”) only if (a) Your Terms provide that the use limitation in Section 3.3 applies to your derivative works, and (b) you identify the specific derivative works that are subject to Your Terms. Notwithstanding Your Terms, this license (including the redistribution requirements in Section 3.1) will continue to apply to the Work itself.
+
+ 3.3 Use Limitation. The Work and any derivative works thereof only may be used or intended for use non-commercially. Notwithstanding the foregoing, NVIDIA Corporation and its affiliates may use the Work and any derivative works commercially. As used herein, “non-commercially” means for research or evaluation purposes only.
+
+ 3.4 Patent Claims. If you bring or threaten to bring a patent claim against any Licensor (including any claim, cross-claim or counterclaim in a lawsuit) to enforce any patents that you allege are infringed by any Work, then your rights under this license from such Licensor (including the grant in Section 2.1) will terminate immediately.
+
+ 3.5 Trademarks. This license does not grant any rights to use any Licensor’s or its affiliates’ names, logos, or trademarks, except as necessary to reproduce the notices described in this license.
+
+ 3.6 Termination. If you violate any term of this license, then your rights under this license (including the grant in Section 2.1) will terminate immediately.
+
+ 4. Disclaimer of Warranty.
+
+ THE WORK IS PROVIDED “AS IS” WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER THIS LICENSE.
+
+ 5. Limitation of Liability.
+
+ EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+
+ STABILITY AI COMMUNITY LICENSE AGREEMENT
+
+ Last Updated: July 5, 2024
+
+ INTRODUCTION
+ This Agreement applies to any individual person or entity (“You”, “Your” or “Licensee”) that uses or distributes any portion or element of the Stability AI Materials or Derivative Works thereof for any Research & Non-Commercial or Commercial purpose. Capitalized terms not otherwise defined herein are defined in Section V below.
+
+ This Agreement is intended to allow research, non-commercial, and limited commercial uses of the Models free of charge. In order to ensure that certain limited commercial uses of the Models continue to be allowed, this Agreement preserves free access to the Models for people or organizations generating annual revenue of less than US $1,000,000 (or local currency equivalent).
+
+ By clicking “I Accept” or by using or distributing or using any portion or element of the Stability Materials or Derivative Works, You agree that You have read, understood and are bound by the terms of this Agreement. If You are acting on behalf of a company, organization or other entity, then “You” includes you and that entity, and You agree that You: (i) are an authorized representative of such entity with the authority to bind such entity to this Agreement, and (ii) You agree to the terms of this Agreement on that entity’s behalf.
+
+ RESEARCH & NON-COMMERCIAL USE LICENSE
+ Subject to the terms of this Agreement, Stability AI grants You a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable and royalty-free limited license under Stability AI’s intellectual property or other rights owned by Stability AI embodied in the Stability AI Materials to use, reproduce, distribute, and create Derivative Works of, and make modifications to, the Stability AI Materials for any Research or Non-Commercial Purpose. “Research Purpose” means academic or scientific advancement, and in each case, is not primarily intended for commercial advantage or monetary compensation to You or others. “Non-Commercial Purpose” means any purpose other than a Research Purpose that is not primarily intended for commercial advantage or monetary compensation to You or others, such as personal use (i.e., hobbyist) or evaluation and testing.
+
+ COMMERCIAL USE LICENSE
+ Subject to the terms of this Agreement (including the remainder of this Section III), Stability AI grants You a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable and royalty-free limited license under Stability AI’s intellectual property or other rights owned by Stability AI embodied in the Stability AI Materials to use, reproduce, distribute, and create Derivative Works of, and make modifications to, the Stability AI Materials for any Commercial Purpose. “Commercial Purpose” means any purpose other than a Research Purpose or Non-Commercial Purpose that is primarily intended for commercial advantage or monetary compensation to You or others, including but not limited to, (i) creating, modifying, or distributing Your product or service, including via a hosted service or application programming interface, and (ii) for Your business’s or organization’s internal operations. If You are using or distributing the Stability AI Materials for a Commercial Purpose, You must register with Stability AI at (https://stability.ai/community-license). If at any time You or Your Affiliate(s), either individually or in aggregate, generate more than USD $1,000,000 in annual revenue (or the equivalent thereof in Your local currency), regardless of whether that revenue is generated directly or indirectly from the Stability AI Materials or Derivative Works, any licenses granted to You under this Agreement shall terminate as of such date. You must request a license from Stability AI at (https://stability.ai/enterprise) , which Stability AI may grant to You in its sole discretion. If you receive Stability AI Materials, or any Derivative Works thereof, from a Licensee as part of an integrated end user product, then Section III of this Agreement will not apply to you.
+
+ GENERAL TERMS
+ Your Research, Non-Commercial, and Commercial License(s) under this Agreement are subject to the following terms. a. Distribution & Attribution. If You distribute or make available the Stability AI Materials or a Derivative Work to a third party, or a product or service that uses any portion of them, You shall: (i) provide a copy of this Agreement to that third party, (ii) retain the following attribution notice within a "Notice" text file distributed as a part of such copies: "This Stability AI Model is licensed under the Stability AI Community License, Copyright © Stability AI Ltd. All Rights Reserved”, and (iii) prominently display “Powered by Stability AI” on a related website, user interface, blogpost, about page, or product documentation. If You create a Derivative Work, You may add your own attribution notice(s) to the “Notice” text file included with that Derivative Work, provided that You clearly indicate which attributions apply to the Stability AI Materials and state in the “Notice” text file that You changed the Stability AI Materials and how it was modified. b. Use Restrictions. Your use of the Stability AI Materials and Derivative Works, including any output or results of the Stability AI Materials or Derivative Works, must comply with applicable laws and regulations (including Trade Control Laws and equivalent regulations) and adhere to the Documentation and Stability AI’s AUP, which is hereby incorporated by reference. Furthermore, You will not use the Stability AI Materials or Derivative Works, or any output or results of the Stability AI Materials or Derivative Works, to create or improve any foundational generative AI model (excluding the Models or Derivative Works). c. Intellectual Property. (i) Trademark License. No trademark licenses are granted under this Agreement, and in connection with the Stability AI Materials or Derivative Works, You may not use any name or mark owned by or associated with Stability AI or any of its Affiliates, except as required under Section IV(a) herein. (ii) Ownership of Derivative Works. As between You and Stability AI, You are the owner of Derivative Works You create, subject to Stability AI’s ownership of the Stability AI Materials and any Derivative Works made by or for Stability AI. (iii) Ownership of Outputs. As between You and Stability AI, You own any outputs generated from the Models or Derivative Works to the extent permitted by applicable law. (iv) Disputes. If You or Your Affiliate(s) institute litigation or other proceedings against Stability AI (including a cross-claim or counterclaim in a lawsuit) alleging that the Stability AI Materials, Derivative Works or associated outputs or results, or any portion of any of the foregoing, constitutes infringement of intellectual property or other rights owned or licensable by You, then any licenses granted to You under this Agreement shall terminate as of the date such litigation or claim is filed or instituted. You will indemnify and hold harmless Stability AI from and against any claim by any third party arising out of or related to Your use or distribution of the Stability AI Materials or Derivative Works in violation of this Agreement. (v) Feedback. From time to time, You may provide Stability AI with verbal and/or written suggestions, comments or other feedback related to Stability AI’s existing or prospective technology, products or services (collectively, “Feedback”). You are not obligated to provide Stability AI with Feedback, but to the extent that You do, You hereby grant Stability AI a perpetual, irrevocable, royalty-free, fully-paid, sub-licensable, transferable, non-exclusive, worldwide right and license to exploit the Feedback in any manner without restriction. Your Feedback is provided “AS IS” and You make no warranties whatsoever about any Feedback. d. Disclaimer Of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE STABILITY AI MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OR LAWFULNESS OF USING OR REDISTRIBUTING THE STABILITY AI MATERIALS, DERIVATIVE WORKS OR ANY OUTPUT OR RESULTS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE STABILITY AI MATERIALS, DERIVATIVE WORKS AND ANY OUTPUT AND RESULTS. e. Limitation Of Liability. IN NO EVENT WILL STABILITY AI OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT, FOR ANY LOST PROFITS OR ANY DIRECT, INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN IF STABILITY AI OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY OF THE FOREGOING. f. Term And Termination. The term of this Agreement will commence upon Your acceptance of this Agreement or access to the Stability AI Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. Stability AI may terminate this Agreement if You are in breach of any term or condition of this Agreement. Upon termination of this Agreement, You shall delete and cease use of any Stability AI Materials or Derivative Works. Section IV(d), (e), and (g) shall survive the termination of this Agreement. g. Governing Law. This Agreement will be governed by and constructed in accordance with the laws of the United States and the State of California without regard to choice of law principles, and the UN Convention on Contracts for International Sale of Goods does not apply to this Agreement.
+
+ DEFINITIONS
+ “Affiliate(s)” means any entity that directly or indirectly controls, is controlled by, or is under common control with the subject entity; for purposes of this definition, “control” means direct or indirect ownership or control of more than 50% of the voting interests of the subject entity.
+
+ "Agreement" means this Stability AI Community License Agreement.
+
+ “AUP” means the Stability AI Acceptable Use Policy available at (https://stability.ai/use-policy), as may be updated from time to time.
+
+ "Derivative Work(s)” means (a) any derivative work of the Stability AI Materials as recognized by U.S. copyright laws and (b) any modifications to a Model, and any other model created which is based on or derived from the Model or the Model’s output, including “fine tune” and “low-rank adaptation” models derived from a Model or a Model’s output, but do not include the output of any Model.
+
+ “Documentation” means any specifications, manuals, documentation, and other written information provided by Stability AI related to the Software or Models.
+
+ “Model(s)" means, collectively, Stability AI’s proprietary models and algorithms, including machine-learning models, trained model weights and other elements of the foregoing listed on Stability’s Core Models Webpage available at (https://stability.ai/core-models), as may be updated from time to time.
+
+ "Stability AI" or "we" means Stability AI Ltd. and its Affiliates.
+
+ "Software" means Stability AI’s proprietary software made available under this Agreement now or in the future.
+
+ “Stability AI Materials” means, collectively, Stability’s proprietary Models, Software and Documentation (and any portion or combination thereof) made available under this Agreement.
+
+ “Trade Control Laws” means any applicable U.S. and non-U.S. export control and trade sanctions laws and regulations.
NOTICE ADDED
@@ -0,0 +1 @@
+ This Stability AI Model is licensed under the Stability AI Community License, Copyright © Stability AI Ltd. All Rights Reserved
README.md ADDED
@@ -0,0 +1,159 @@
+ ---
+ datasets:
+ - DL3DV/DL3DV-10K-Sample
+ language:
+ - en
+ ---
+ # **Difix3D+: Improving 3D Reconstructions with Single-Step Diffusion Models**
+ CVPR 2025 (Oral)
+ [**Code**](https://github.com/nv-tlabs/Difix3D) | [**Project Page**](https://research.nvidia.com/labs/toronto-ai/difix3d/) | [**Paper**](https://arxiv.org/abs/2503.01774)
+
+ ## Description:
+ Difix is a single-step image diffusion model trained to enhance rendered novel views and remove the artifacts caused by
+ underconstrained regions of the 3D representation. The technology behind Difix is based on the concepts outlined in the paper
+ [DIFIX3D+: Improving 3D Reconstructions with Single-Step Diffusion Models](https://arxiv.org/abs/2503.01774).
+
+ Difix has two operation modes:
+
+ * Offline mode: Used during the reconstruction phase to clean up pseudo-training views rendered from the reconstruction,
+ which are then distilled back into 3D. This greatly enhances underconstrained regions and improves the overall quality of the 3D representation.
+ * Online mode: Acts as a neural enhancer during inference, effectively removing residual artifacts arising from imperfect 3D
+ supervision and the limited capacity of current reconstruction models.
+
+ Difix is an all-encompassing solution: a single model compatible with both NeRF and 3DGS representations.
+
+ **This model is ready for research and development/non-commercial use only.**
+
+ **Model Developer:** NVIDIA
+
+ **Model Versions:** difix_ref
+
+ **Deployment Geography:** Global
+
+ ### License/Terms of Use:
+ The use of the model and code is governed by the NVIDIA License. Additional Information: [LICENSE.md · stabilityai/sd-turbo at main](https://huggingface.co/stabilityai/sd-turbo/blob/main/LICENSE.md)
+
+
+ ### Use Case:
+ Difix is intended for Physical AI developers looking to enhance and improve their Neural Reconstruction pipelines. The model takes an image as input and outputs a fixed image.
+
+ **Release Date:** GitHub: [June 2025](https://github.com/nv-tlabs/Difix3D)
+
+ ## Model Architecture
+
+ **Architecture Type**: UNet
+
+ **Network Architecture**: A latent diffusion-based UNet coupled with a variational autoencoder (VAE).
+
+ ## Input
+
+ **Input Type(s)**: Image
+
+ **Input Format(s)**: Red, Green, Blue (RGB)
+
+ **Input Parameters**: Two-Dimensional (2D)
+
+ **Other Properties Related to Input**:
+ * Specific Resolution: [576px x 1024px]
+
+ ## Output
+
+ **Output Type(s)**: Image
+
+ **Output Format(s)**: Red, Green, Blue (RGB)
+
+ **Output Parameters**: Two-Dimensional (2D)
+
+ **Other Properties Related to Output**:
+ * Specific Resolution: [576px x 1024px]
+
+ ## Software Integration
+
+ **Runtime Engine(s)**: PyTorch
+
+ **Supported Hardware Microarchitecture Compatibility**:
+ * NVIDIA Ampere
+ * NVIDIA Hopper
+
+ **Note**: We are testing with FP32 precision.
+
+ ## Inference
+ **Acceleration Engine**: [PyTorch](https://pytorch.org/)
+
+ **Test Hardware**:
+ * A100
+ * H100
+
+ **Operating System(s):** Linux (We have not tested on other operating systems.)
+
+ **System Requirements and Performance:**
+ This model requires X GB of GPU VRAM.
+ The following table shows the inference time for a single generation on different NVIDIA GPU hardware:
+
+ | GPU Hardware | Inference Runtime |
+ |--------------|-------------------|
+ | NVIDIA A100  | 0.355 sec |
+ | NVIDIA H100  | 0.223 sec |
+
+ ## Use the Difix Model
+ Please visit the [Difix3D repository](https://github.com/nv-tlabs/Difix3D) to access all relevant files and code needed to use Difix.
+
+
+ ## Difix Dataset
+ - Data Collection Method: Human
+ - Labeling Method by Dataset: Human
+ - Properties: Difix was trained, tested, and evaluated using the [DL3DV-10K dataset](https://huggingface.co/datasets/DL3DV/DL3DV-10K-Sample), where 80% of the data was used for training, 10% for evaluation, and 10% for testing. DL3DV-10K is a large-scale dataset consisting of 10,510 high-resolution (4K) real-world video sequences, totaling approximately 51.2 million frames. The scenes span 65 diverse categories across indoor and outdoor environments. Each video is accompanied by metadata describing environmental conditions such as lighting (natural, artificial, mixed), surface materials (e.g., reflective or transparent), and texture complexity. The dataset is designed to support the development and evaluation of learning-based 3D vision methods.
+
+
+ ## Ethical Considerations:
+ NVIDIA believes Trustworthy AI is a shared responsibility, and we have established policies and practices to enable development for a wide array of AI applications. When downloaded or used in accordance with our terms of service, developers should work with their internal model team to ensure this model meets requirements for the relevant industry and use case and addresses unforeseen product misuse.
+
+ Please report security vulnerabilities or NVIDIA AI concerns [here](https://www.nvidia.com/en-us/support/submit-security-vulnerability/).
+
+ ---
+
+ ## ModelCard++
+
+ ### Bias
+
+ | Field | Response |
+ | :---- | :------- |
+ | Participation considerations from adversely impacted groups [protected classes](https://www.senate.ca.gov/content/protected-classes) in model design and testing: | None |
+ | Measures taken to mitigate against unwanted bias: | None |
+
+ ### Explainability
+
+ | Field | Response |
+ | :---- | :------- |
+ | Intended Domain: | Advanced Driver Assistance Systems |
+ | Model Type: | Image-to-Image |
+ | Intended Users: | Autonomous Vehicle developers enhancing and improving Neural Reconstruction pipelines. |
+ | Output: | Image |
+ | Describe how the model works: | The model takes an image as input and outputs a fixed image. |
+ | Name the adversely impacted groups this has been tested to deliver comparable outcomes regardless of: | None |
+ | Technical Limitations: | The reconstruction relies on the quality and consistency of input images and camera calibrations; any deficiencies in these areas can negatively impact the final output. |
+ | Verified to have met prescribed NVIDIA quality standards: | Yes |
+ | Performance Metrics: | FID (Fréchet Inception Distance), PSNR (Peak Signal-to-Noise Ratio), LPIPS (Learned Perceptual Image Patch Similarity) |
+ | Potential Known Risks: | The model is not guaranteed to fix 100% of the image artifacts. Please verify that the generated scenarios are appropriate for the context and use case. |
+ | Licensing: | The use of the model and code is governed by the NVIDIA License. Additional Information: [LICENSE.md · stabilityai/sd-turbo at main](https://huggingface.co/stabilityai/sd-turbo/blob/main/LICENSE.md). |
+
+ ### Privacy
+
+ | Field | Response |
+ | :---- | :------- |
+ | Generatable or reverse engineerable personal data? | No |
+ | Personal data used to create this model? | No |
+ | How often is the dataset reviewed? | Before release |
+ | Is there provenance for all datasets used in training? | Yes |
+ | Does data labeling (annotation, metadata) comply with privacy laws? | Yes |
+ | Is data compliant with data subject requests for data correction or removal, if such a request was made? | Yes |
+
+ ### Safety & Security
+
+ | Field | Response |
+ | :---- | :------- |
+ | Model Application(s): | Image Enhancement |
+ | List types of specific high-risk AI systems, if any, in which the model can be integrated: | The model can be used to develop Autonomous Vehicle stacks that can be integrated inside vehicles. The Difix model should not be deployed in a vehicle. |
+ | Describe the life critical impact (if present). | N/A - The model should not be deployed in a vehicle and will not perform life-critical tasks. |
+ | Use Case Restrictions: | Your use of the model and code is governed by the NVIDIA License. Additional Information: LICENSE.md · stabilityai/sd-turbo at main |
+ | Model and dataset restrictions: | The Principle of Least Privilege (PoLP) is applied, limiting access for dataset generation and model development. Restrictions enforce dataset access during training, and dataset license constraints are adhered to. |
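To make the "Use the Difix Model" section of the README above concrete, here is a minimal, hedged loading sketch based only on the `model_index.json` added in this commit (which declares a custom `DifixPipeline` on top of standard diffusers components). The supported entry point, call signature, and prompt handling live in the Difix3D GitHub repository, so the `trust_remote_code` loading path and the argument names below are assumptions, not the documented API.

```python
# Hedged sketch only: consult https://github.com/nv-tlabs/Difix3D for the supported workflow.
import torch
from diffusers import DiffusionPipeline
from diffusers.utils import load_image

# model_index.json names a custom "DifixPipeline"; loading it generically via
# trust_remote_code is an assumption -- the class may instead need to be imported
# from the Difix3D repository code.
pipe = DiffusionPipeline.from_pretrained(
    "nvidia/difix_ref",
    trust_remote_code=True,
    torch_dtype=torch.float32,  # the model card notes testing was done in FP32
).to("cuda")

# The card specifies 576x1024 RGB inputs and outputs.
rendered = load_image("rendered_view.png").resize((1024, 576))  # hypothetical local file
fixed = pipe(image=rendered).images[0]  # keyword name and output layout are assumptions
fixed.save("fixed_view.png")
```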
model_index.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "_class_name": "DifixPipeline",
+   "_diffusers_version": "0.25.1",
+   "_name_or_path": "nvidia/difix_ref",
+   "feature_extractor": [
+     null,
+     null
+   ],
+   "image_encoder": [
+     null,
+     null
+   ],
+   "requires_safety_checker": true,
+   "safety_checker": [
+     null,
+     null
+   ],
+   "scheduler": [
+     "diffusers",
+     "DDPMScheduler"
+   ],
+   "text_encoder": [
+     "transformers",
+     "CLIPTextModel"
+   ],
+   "tokenizer": [
+     "transformers",
+     "CLIPTokenizer"
+   ],
+   "unet": [
+     "unet_2d_condition",
+     "UNet2DConditionModel"
+   ],
+   "vae": [
+     "autoencoder_kl",
+     "AutoencoderKL"
+   ]
+ }
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "_class_name": "DDPMScheduler",
+   "_diffusers_version": "0.25.1",
+   "beta_end": 0.012,
+   "beta_schedule": "scaled_linear",
+   "beta_start": 0.00085,
+   "clip_sample": false,
+   "clip_sample_range": 1.0,
+   "dynamic_thresholding_ratio": 0.995,
+   "interpolation_type": "linear",
+   "num_train_timesteps": 1000,
+   "prediction_type": "epsilon",
+   "rescale_betas_zero_snr": false,
+   "sample_max_value": 1.0,
+   "set_alpha_to_one": false,
+   "sigma_max": null,
+   "sigma_min": null,
+   "skip_prk_steps": true,
+   "steps_offset": 1,
+   "thresholding": false,
+   "timestep_spacing": "trailing",
+   "timestep_type": "discrete",
+   "trained_betas": null,
+   "use_karras_sigmas": false,
+   "variance_type": "fixed_small"
+ }
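The scheduler config above is a stock diffusers `DDPMScheduler` configuration, so it can be instantiated on its own with the ordinary diffusers API. A minimal sketch, assuming the repo id `nvidia/difix_ref` and Hub access; the single-step setting is an assumption based on the model being described as a single-step diffusion model, not on code in this commit:

```python
# Minimal sketch: load the scheduler exactly as configured in this commit.
from diffusers import DDPMScheduler

scheduler = DDPMScheduler.from_pretrained("nvidia/difix_ref", subfolder="scheduler")

# 1000 training timesteps with "trailing" spacing; a single-step model would
# typically be run with one inference step (assumption, see lead-in above).
scheduler.set_timesteps(num_inference_steps=1)
print(scheduler.timesteps)  # tensor([999]) under trailing spacing
```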
text_encoder/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "_name_or_path": "nvidia/difix_ref/text_encoder",
+   "architectures": [
+     "CLIPTextModel"
+   ],
+   "attention_dropout": 0.0,
+   "bos_token_id": 0,
+   "dropout": 0.0,
+   "eos_token_id": 2,
+   "hidden_act": "gelu",
+   "hidden_size": 1024,
+   "initializer_factor": 1.0,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 77,
+   "model_type": "clip_text_model",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 23,
+   "pad_token_id": 1,
+   "projection_dim": 512,
+   "torch_dtype": "float32",
+   "transformers_version": "4.48.3",
+   "vocab_size": 49408
+ }
text_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:67e013543d4fac905c882e2993d86a2d454ee69dc9e8f37c0c23d33a48959d15
+ size 1361596304
tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<|startoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "!",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,39 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "!",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "49406": {
+       "content": "<|startoftext|>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "49407": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<|startoftext|>",
+   "clean_up_tokenization_spaces": true,
+   "do_lower_case": true,
+   "eos_token": "<|endoftext|>",
+   "errors": "replace",
+   "extra_special_tokens": {},
+   "model_max_length": 77,
+   "pad_token": "!",
+   "tokenizer_class": "CLIPTokenizer",
+   "unk_token": "<|endoftext|>"
+ }
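The tokenizer and text encoder added above are a standard CLIP pair: "!" as the pad token, a 77-token context, and 1024-dimensional hidden states that match the UNet's `cross_attention_dim`. As a small illustration, they load with plain transformers (assuming the repo id `nvidia/difix_ref`); the prompt string is just an example, since the training prompt is not shown in this commit:

```python
# Minimal sketch: produce the 77x1024 text embedding the UNet cross-attends to.
import torch
from transformers import CLIPTextModel, CLIPTokenizer

tokenizer = CLIPTokenizer.from_pretrained("nvidia/difix_ref", subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained("nvidia/difix_ref", subfolder="text_encoder")

tokens = tokenizer(
    "remove degradation",                    # example prompt (assumption)
    padding="max_length",
    max_length=tokenizer.model_max_length,   # 77, per tokenizer_config.json
    return_tensors="pt",
)
with torch.no_grad():
    text_emb = text_encoder(tokens.input_ids).last_hidden_state
print(text_emb.shape)  # torch.Size([1, 77, 1024]) -- matches cross_attention_dim=1024
```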
tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
unet/config.json ADDED
@@ -0,0 +1,73 @@
+ {
+   "_class_name": "UNet2DConditionModel",
+   "_diffusers_version": "0.25.1",
+   "_name_or_path": "nvidia/difix_ref/unet",
+   "act_fn": "silu",
+   "addition_embed_type": null,
+   "addition_embed_type_num_heads": 64,
+   "addition_time_embed_dim": null,
+   "attention_head_dim": [
+     5,
+     10,
+     20,
+     20
+   ],
+   "attention_type": "default",
+   "block_out_channels": [
+     320,
+     640,
+     1280,
+     1280
+   ],
+   "center_input_sample": false,
+   "class_embed_type": null,
+   "class_embeddings_concat": false,
+   "conv_in_kernel": 3,
+   "conv_out_kernel": 3,
+   "cross_attention_dim": 1024,
+   "cross_attention_norm": null,
+   "down_block_types": [
+     "CrossAttnDownBlock2D",
+     "CrossAttnDownBlock2D",
+     "CrossAttnDownBlock2D",
+     "DownBlock2D"
+   ],
+   "downsample_padding": 1,
+   "dropout": 0.0,
+   "dual_cross_attention": false,
+   "encoder_hid_dim": null,
+   "encoder_hid_dim_type": null,
+   "flip_sin_to_cos": true,
+   "freq_shift": 0,
+   "in_channels": 4,
+   "layers_per_block": 2,
+   "mid_block_only_cross_attention": null,
+   "mid_block_scale_factor": 1,
+   "mid_block_type": "UNetMidBlock2DCrossAttn",
+   "norm_eps": 1e-05,
+   "norm_num_groups": 32,
+   "num_attention_heads": null,
+   "num_class_embeds": null,
+   "only_cross_attention": false,
+   "out_channels": 4,
+   "projection_class_embeddings_input_dim": null,
+   "resnet_out_scale_factor": 1.0,
+   "resnet_skip_time_act": false,
+   "resnet_time_scale_shift": "default",
+   "reverse_transformer_layers_per_block": null,
+   "sample_size": 64,
+   "time_cond_proj_dim": null,
+   "time_embedding_act_fn": null,
+   "time_embedding_dim": null,
+   "time_embedding_type": "positional",
+   "timestep_post_act": null,
+   "transformer_layers_per_block": 1,
+   "up_block_types": [
+     "UpBlock2D",
+     "CrossAttnUpBlock2D",
+     "CrossAttnUpBlock2D",
+     "CrossAttnUpBlock2D"
+   ],
+   "upcast_attention": null,
+   "use_linear_projection": true
+ }
unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cf723d40f29c6915b6ac2aac3c1dab4fe685afe35a41e725ad63a34124c0ec46
+ size 3463726504
unet/unet_2d_condition.py ADDED
@@ -0,0 +1,1343 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from dataclasses import dataclass
15
+ from typing import Any, Dict, List, Optional, Tuple, Union
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ import torch.utils.checkpoint
20
+
21
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
22
+ from diffusers.loaders import UNet2DConditionLoadersMixin
23
+ from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, deprecate, logging, scale_lora_layers, unscale_lora_layers
24
+ from diffusers.models.activations import get_activation
25
+ from diffusers.models.attention_processor import (
26
+ ADDED_KV_ATTENTION_PROCESSORS,
27
+ CROSS_ATTENTION_PROCESSORS,
28
+ Attention,
29
+ AttentionProcessor,
30
+ AttnAddedKVProcessor,
31
+ AttnProcessor,
32
+ )
33
+ from diffusers.models.embeddings import (
34
+ GaussianFourierProjection,
35
+ ImageHintTimeEmbedding,
36
+ ImageProjection,
37
+ ImageTimeEmbedding,
38
+ PositionNet,
39
+ TextImageProjection,
40
+ TextImageTimeEmbedding,
41
+ TextTimeEmbedding,
42
+ TimestepEmbedding,
43
+ Timesteps,
44
+ )
45
+ from diffusers.models.modeling_utils import ModelMixin
46
+ from diffusers.models.unet_2d_blocks import (
47
+ UNetMidBlock2D,
48
+ UNetMidBlock2DCrossAttn,
49
+ UNetMidBlock2DSimpleCrossAttn,
50
+ get_down_block,
51
+ get_up_block,
52
+ )
53
+
54
+
55
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
56
+
57
+
58
+ from diffusers.models.attention import BasicTransformerBlock, _chunked_feed_forward
59
+ from einops import rearrange
60
+
61
+ def new_forward(
62
+ self,
63
+ hidden_states: torch.FloatTensor,
64
+ attention_mask: Optional[torch.FloatTensor] = None,
65
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
66
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
67
+ timestep: Optional[torch.LongTensor] = None,
68
+ cross_attention_kwargs: Dict[str, Any] = None,
69
+ class_labels: Optional[torch.LongTensor] = None,
70
+ added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
71
+ ) -> torch.FloatTensor:
72
+ # Notice that normalization is always applied before the real computation in the following blocks.
73
+ # 0. Self-Attention
74
+
75
+ num_views = 2 # Assuming 2 views for simplicity, can be parameterized later
76
+ hidden_states = rearrange(hidden_states, "(b v) n d -> b (v n) d", v=num_views)
77
+ batch_size = hidden_states.shape[0]
78
+
79
+ if self.use_ada_layer_norm:
80
+ norm_hidden_states = self.norm1(hidden_states, timestep)
81
+ elif self.use_ada_layer_norm_zero:
82
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
83
+ hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
84
+ )
85
+ elif self.use_layer_norm:
86
+ norm_hidden_states = self.norm1(hidden_states)
87
+ elif self.use_ada_layer_norm_continuous:
88
+ norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"])
89
+ elif self.use_ada_layer_norm_single:
90
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
91
+ self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
92
+ ).chunk(6, dim=1)
93
+ norm_hidden_states = self.norm1(hidden_states)
94
+ norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
95
+ norm_hidden_states = norm_hidden_states.squeeze(1)
96
+ else:
97
+ raise ValueError("Incorrect norm used")
98
+
99
+ if self.pos_embed is not None:
100
+ norm_hidden_states = self.pos_embed(norm_hidden_states)
101
+
102
+ # 1. Retrieve lora scale.
103
+ lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
104
+
105
+ # 2. Prepare GLIGEN inputs
106
+ cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
107
+ gligen_kwargs = cross_attention_kwargs.pop("gligen", None)
108
+
109
+ attn_output = self.attn1(
110
+ norm_hidden_states,
111
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
112
+ attention_mask=attention_mask,
113
+ **cross_attention_kwargs,
114
+ )
115
+ if self.use_ada_layer_norm_zero:
116
+ attn_output = gate_msa.unsqueeze(1) * attn_output
117
+ elif self.use_ada_layer_norm_single:
118
+ attn_output = gate_msa * attn_output
119
+
120
+ hidden_states = attn_output + hidden_states
121
+ if hidden_states.ndim == 4:
122
+ hidden_states = hidden_states.squeeze(1)
123
+
124
+ # 2.5 GLIGEN Control
125
+ if gligen_kwargs is not None:
126
+ hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])
127
+
128
+ hidden_states = rearrange(hidden_states, "b (v n) d -> (b v) n d", v=num_views)
129
+
130
+ # 3. Cross-Attention
131
+ if self.attn2 is not None:
132
+ if self.use_ada_layer_norm:
133
+ norm_hidden_states = self.norm2(hidden_states, timestep)
134
+ elif self.use_ada_layer_norm_zero or self.use_layer_norm:
135
+ norm_hidden_states = self.norm2(hidden_states)
136
+ elif self.use_ada_layer_norm_single:
137
+ # For PixArt norm2 isn't applied here:
138
+ # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
139
+ norm_hidden_states = hidden_states
140
+ elif self.use_ada_layer_norm_continuous:
141
+ norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"])
142
+ else:
143
+ raise ValueError("Incorrect norm")
144
+
145
+ if self.pos_embed is not None and self.use_ada_layer_norm_single is False:
146
+ norm_hidden_states = self.pos_embed(norm_hidden_states)
147
+
148
+ attn_output = self.attn2(
149
+ norm_hidden_states,
150
+ encoder_hidden_states=encoder_hidden_states,
151
+ attention_mask=encoder_attention_mask,
152
+ **cross_attention_kwargs,
153
+ )
154
+ hidden_states = attn_output + hidden_states
155
+
156
+ # 4. Feed-forward
157
+ if self.use_ada_layer_norm_continuous:
158
+ norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"])
159
+ elif not self.use_ada_layer_norm_single:
160
+ norm_hidden_states = self.norm3(hidden_states)
161
+
162
+ if self.use_ada_layer_norm_zero:
163
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
164
+
165
+ if self.use_ada_layer_norm_single:
166
+ norm_hidden_states = self.norm2(hidden_states)
167
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
168
+
169
+ if self._chunk_size is not None:
170
+ # "feed_forward_chunk_size" can be used to save memory
171
+ ff_output = _chunked_feed_forward(
172
+ self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size, lora_scale=lora_scale
173
+ )
174
+ else:
175
+ ff_output = self.ff(norm_hidden_states, scale=lora_scale)
176
+
177
+ if self.use_ada_layer_norm_zero:
178
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
179
+ elif self.use_ada_layer_norm_single:
180
+ ff_output = gate_mlp * ff_output
181
+
182
+ hidden_states = ff_output + hidden_states
183
+ if hidden_states.ndim == 4:
184
+ hidden_states = hidden_states.squeeze(1)
185
+
186
+ return hidden_states
187
+
188
+ # Monkey-patch the class
189
+ BasicTransformerBlock.forward = new_forward
190
+
191
+
192
+ @dataclass
193
+ class UNet2DConditionOutput(BaseOutput):
194
+ """
195
+ The output of [`UNet2DConditionModel`].
196
+
197
+ Args:
198
+ sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
199
+ The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
200
+ """
201
+
202
+ sample: torch.FloatTensor = None
203
+
204
+
205
+ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
206
+ r"""
207
+ A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
208
+ shaped output.
209
+
210
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
211
+ for all models (such as downloading or saving).
212
+
213
+ Parameters:
214
+ sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
215
+ Height and width of input/output sample.
216
+ in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample.
217
+ out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
218
+ center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
219
+ flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
220
+ Whether to flip the sin to cos in the time embedding.
221
+ freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
222
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
223
+ The tuple of downsample blocks to use.
224
+ mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
225
+ Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`, `UNetMidBlock2D`, or
226
+ `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped.
227
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
228
+ The tuple of upsample blocks to use.
229
+ only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`):
230
+ Whether to include self-attention in the basic transformer blocks, see
231
+ [`~models.attention.BasicTransformerBlock`].
232
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
233
+ The tuple of output channels for each block.
234
+ layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
235
+ downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
236
+ mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
237
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
238
+ act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
239
+ norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
240
+ If `None`, normalization and activation layers is skipped in post-processing.
241
+ norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
242
+ cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
243
+ The dimension of the cross attention features.
244
+ transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
245
+ The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
246
+ [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
247
+ [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
248
+ reverse_transformer_layers_per_block : (`Tuple[Tuple]`, *optional*, defaults to None):
249
+ The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`], in the upsampling
250
+ blocks of the U-Net. Only relevant if `transformer_layers_per_block` is of type `Tuple[Tuple]` and for
251
+ [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
252
+ [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
253
+ encoder_hid_dim (`int`, *optional*, defaults to None):
254
+ If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
255
+ dimension to `cross_attention_dim`.
256
+ encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
257
+ If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
258
+ embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`.
259
+ attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
260
+ num_attention_heads (`int`, *optional*):
261
+ The number of attention heads. If not defined, defaults to `attention_head_dim`
262
+ resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
263
+ for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
264
+ class_embed_type (`str`, *optional*, defaults to `None`):
265
+ The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`,
266
+ `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
267
+ addition_embed_type (`str`, *optional*, defaults to `None`):
268
+ Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
269
+ "text". "text" will use the `TextTimeEmbedding` layer.
270
+ addition_time_embed_dim: (`int`, *optional*, defaults to `None`):
271
+ Dimension for the timestep embeddings.
272
+ num_class_embeds (`int`, *optional*, defaults to `None`):
273
+ Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
274
+ class conditioning with `class_embed_type` equal to `None`.
275
+ time_embedding_type (`str`, *optional*, defaults to `positional`):
276
+ The type of position embedding to use for timesteps. Choose from `positional` or `fourier`.
277
+ time_embedding_dim (`int`, *optional*, defaults to `None`):
278
+ An optional override for the dimension of the projected time embedding.
279
+ time_embedding_act_fn (`str`, *optional*, defaults to `None`):
280
+ Optional activation function to use only once on the time embeddings before they are passed to the rest of
281
+ the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`.
282
+ timestep_post_act (`str`, *optional*, defaults to `None`):
283
+ The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`.
284
+ time_cond_proj_dim (`int`, *optional*, defaults to `None`):
285
+ The dimension of `cond_proj` layer in the timestep embedding.
286
+ conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer. conv_out_kernel (`int`,
287
+ *optional*, default to `3`): The kernel size of `conv_out` layer. projection_class_embeddings_input_dim (`int`,
288
+ *optional*): The dimension of the `class_labels` input when
289
+ `class_embed_type="projection"`. Required when `class_embed_type="projection"`.
290
+ class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time
291
+ embeddings with the class embeddings.
292
+ mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`):
293
+ Whether to use cross attention with the mid block when using the `UNetMidBlock2DSimpleCrossAttn`. If
294
+ `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is `None`, the
295
+ `only_cross_attention` value is used as the value for `mid_block_only_cross_attention`. Default to `False`
296
+ otherwise.
297
+ """
298
+
299
+ _supports_gradient_checkpointing = True
300
+
301
+ @register_to_config
302
+ def __init__(
303
+ self,
304
+ sample_size: Optional[int] = None,
305
+ in_channels: int = 4,
306
+ out_channels: int = 4,
307
+ center_input_sample: bool = False,
308
+ flip_sin_to_cos: bool = True,
309
+ freq_shift: int = 0,
310
+ down_block_types: Tuple[str] = (
311
+ "CrossAttnDownBlock2D",
312
+ "CrossAttnDownBlock2D",
313
+ "CrossAttnDownBlock2D",
314
+ "DownBlock2D",
315
+ ),
316
+ mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
317
+ up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
318
+ only_cross_attention: Union[bool, Tuple[bool]] = False,
319
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
320
+ layers_per_block: Union[int, Tuple[int]] = 2,
321
+ downsample_padding: int = 1,
322
+ mid_block_scale_factor: float = 1,
323
+ dropout: float = 0.0,
324
+ act_fn: str = "silu",
325
+ norm_num_groups: Optional[int] = 32,
326
+ norm_eps: float = 1e-5,
327
+ cross_attention_dim: Union[int, Tuple[int]] = 1280,
328
+ transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
329
+ reverse_transformer_layers_per_block: Optional[Tuple[Tuple[int]]] = None,
330
+ encoder_hid_dim: Optional[int] = None,
331
+ encoder_hid_dim_type: Optional[str] = None,
332
+ attention_head_dim: Union[int, Tuple[int]] = 8,
333
+ num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
334
+ dual_cross_attention: bool = False,
335
+ use_linear_projection: bool = False,
336
+ class_embed_type: Optional[str] = None,
337
+ addition_embed_type: Optional[str] = None,
338
+ addition_time_embed_dim: Optional[int] = None,
339
+ num_class_embeds: Optional[int] = None,
340
+ upcast_attention: bool = False,
341
+ resnet_time_scale_shift: str = "default",
342
+ resnet_skip_time_act: bool = False,
343
+ resnet_out_scale_factor: int = 1.0,
344
+ time_embedding_type: str = "positional",
345
+ time_embedding_dim: Optional[int] = None,
346
+ time_embedding_act_fn: Optional[str] = None,
347
+ timestep_post_act: Optional[str] = None,
348
+ time_cond_proj_dim: Optional[int] = None,
349
+ conv_in_kernel: int = 3,
350
+ conv_out_kernel: int = 3,
351
+ projection_class_embeddings_input_dim: Optional[int] = None,
352
+ attention_type: str = "default",
353
+ class_embeddings_concat: bool = False,
354
+ mid_block_only_cross_attention: Optional[bool] = None,
355
+ cross_attention_norm: Optional[str] = None,
356
+ addition_embed_type_num_heads=64,
357
+ ):
358
+ super().__init__()
359
+
360
+ self.sample_size = sample_size
361
+
362
+ if num_attention_heads is not None:
363
+ raise ValueError(
364
+ "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
365
+ )
366
+
367
+ # If `num_attention_heads` is not defined (which is the case for most models)
368
+ # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
369
+ # The reason for this behavior is to correct for incorrectly named variables that were introduced
370
+ # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
371
+ # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
372
+ # which is why we correct for the naming here.
373
+ num_attention_heads = num_attention_heads or attention_head_dim
374
+
375
+ # Check inputs
376
+ if len(down_block_types) != len(up_block_types):
377
+ raise ValueError(
378
+ f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
379
+ )
380
+
381
+ if len(block_out_channels) != len(down_block_types):
382
+ raise ValueError(
383
+ f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
384
+ )
385
+
386
+ if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
387
+ raise ValueError(
388
+ f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
389
+ )
390
+
391
+ if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
392
+ raise ValueError(
393
+ f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
394
+ )
395
+
396
+ if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
397
+ raise ValueError(
398
+ f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
399
+ )
400
+
401
+ if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
402
+ raise ValueError(
403
+ f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
404
+ )
405
+
406
+ if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
407
+ raise ValueError(
408
+ f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
409
+ )
410
+ if isinstance(transformer_layers_per_block, list) and reverse_transformer_layers_per_block is None:
411
+ for layer_number_per_block in transformer_layers_per_block:
412
+ if isinstance(layer_number_per_block, list):
413
+ raise ValueError("Must provide 'reverse_transformer_layers_per_block` if using asymmetrical UNet.")
414
+
415
+ # input
416
+ conv_in_padding = (conv_in_kernel - 1) // 2
417
+ self.conv_in = nn.Conv2d(
418
+ in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
419
+ )
420
+
421
+ # time
422
+ if time_embedding_type == "fourier":
423
+ time_embed_dim = time_embedding_dim or block_out_channels[0] * 2
424
+ if time_embed_dim % 2 != 0:
425
+ raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.")
426
+ self.time_proj = GaussianFourierProjection(
427
+ time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos
428
+ )
429
+ timestep_input_dim = time_embed_dim
430
+ elif time_embedding_type == "positional":
431
+ time_embed_dim = time_embedding_dim or block_out_channels[0] * 4
432
+
433
+ self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
434
+ timestep_input_dim = block_out_channels[0]
435
+ else:
436
+ raise ValueError(
437
+ f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
438
+ )
439
+
440
+ self.time_embedding = TimestepEmbedding(
441
+ timestep_input_dim,
442
+ time_embed_dim,
443
+ act_fn=act_fn,
444
+ post_act_fn=timestep_post_act,
445
+ cond_proj_dim=time_cond_proj_dim,
446
+ )
447
+
448
+ if encoder_hid_dim_type is None and encoder_hid_dim is not None:
449
+ encoder_hid_dim_type = "text_proj"
450
+ self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
451
+ logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
452
+
453
+ if encoder_hid_dim is None and encoder_hid_dim_type is not None:
454
+ raise ValueError(
455
+ f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
456
+ )
457
+
458
+ if encoder_hid_dim_type == "text_proj":
459
+ self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
460
+ elif encoder_hid_dim_type == "text_image_proj":
461
+ # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
462
+ # it is set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
463
+ # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)`
464
+ self.encoder_hid_proj = TextImageProjection(
465
+ text_embed_dim=encoder_hid_dim,
466
+ image_embed_dim=cross_attention_dim,
467
+ cross_attention_dim=cross_attention_dim,
468
+ )
469
+ elif encoder_hid_dim_type == "image_proj":
470
+ # Kandinsky 2.2
471
+ self.encoder_hid_proj = ImageProjection(
472
+ image_embed_dim=encoder_hid_dim,
473
+ cross_attention_dim=cross_attention_dim,
474
+ )
475
+ elif encoder_hid_dim_type is not None:
476
+ raise ValueError(
477
+ f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
478
+ )
479
+ else:
480
+ self.encoder_hid_proj = None
481
+
482
+ # class embedding
483
+ if class_embed_type is None and num_class_embeds is not None:
484
+ self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
485
+ elif class_embed_type == "timestep":
486
+ self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn)
487
+ elif class_embed_type == "identity":
488
+ self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
489
+ elif class_embed_type == "projection":
490
+ if projection_class_embeddings_input_dim is None:
491
+ raise ValueError(
492
+ "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
493
+ )
494
+ # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
495
+ # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
496
+ # 2. it projects from an arbitrary input dimension.
497
+ #
498
+ # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
499
+ # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
500
+ # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
501
+ self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
502
+ elif class_embed_type == "simple_projection":
503
+ if projection_class_embeddings_input_dim is None:
504
+ raise ValueError(
505
+ "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
506
+ )
507
+ self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim)
508
+ else:
509
+ self.class_embedding = None
510
+
511
+ if addition_embed_type == "text":
512
+ if encoder_hid_dim is not None:
513
+ text_time_embedding_from_dim = encoder_hid_dim
514
+ else:
515
+ text_time_embedding_from_dim = cross_attention_dim
516
+
517
+ self.add_embedding = TextTimeEmbedding(
518
+ text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
519
+ )
520
+ elif addition_embed_type == "text_image":
521
+ # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
522
+ # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
523
+ # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)`
524
+ self.add_embedding = TextImageTimeEmbedding(
525
+ text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
526
+ )
527
+ elif addition_embed_type == "text_time":
528
+ self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
529
+ self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
530
+ elif addition_embed_type == "image":
531
+ # Kandinsky 2.2
532
+ self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
533
+ elif addition_embed_type == "image_hint":
534
+ # Kandinsky 2.2 ControlNet
535
+ self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
536
+ elif addition_embed_type is not None:
537
+ raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
538
+
539
+ if time_embedding_act_fn is None:
540
+ self.time_embed_act = None
541
+ else:
542
+ self.time_embed_act = get_activation(time_embedding_act_fn)
543
+
544
+ self.down_blocks = nn.ModuleList([])
545
+ self.up_blocks = nn.ModuleList([])
546
+
547
+ if isinstance(only_cross_attention, bool):
548
+ if mid_block_only_cross_attention is None:
549
+ mid_block_only_cross_attention = only_cross_attention
550
+
551
+ only_cross_attention = [only_cross_attention] * len(down_block_types)
552
+
553
+ if mid_block_only_cross_attention is None:
554
+ mid_block_only_cross_attention = False
555
+
556
+ if isinstance(num_attention_heads, int):
557
+ num_attention_heads = (num_attention_heads,) * len(down_block_types)
558
+
559
+ if isinstance(attention_head_dim, int):
560
+ attention_head_dim = (attention_head_dim,) * len(down_block_types)
561
+
562
+ if isinstance(cross_attention_dim, int):
563
+ cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
564
+
565
+ if isinstance(layers_per_block, int):
566
+ layers_per_block = [layers_per_block] * len(down_block_types)
567
+
568
+ if isinstance(transformer_layers_per_block, int):
569
+ transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
570
+
571
+ if class_embeddings_concat:
572
+ # The time embeddings are concatenated with the class embeddings. The dimension of the
573
+ # time embeddings passed to the down, middle, and up blocks is twice the dimension of the
574
+ # regular time embeddings
575
+ blocks_time_embed_dim = time_embed_dim * 2
576
+ else:
577
+ blocks_time_embed_dim = time_embed_dim
578
+
579
+ # down
580
+ output_channel = block_out_channels[0]
581
+ for i, down_block_type in enumerate(down_block_types):
582
+ input_channel = output_channel
583
+ output_channel = block_out_channels[i]
584
+ is_final_block = i == len(block_out_channels) - 1
585
+
586
+ down_block = get_down_block(
587
+ down_block_type,
588
+ num_layers=layers_per_block[i],
589
+ transformer_layers_per_block=transformer_layers_per_block[i],
590
+ in_channels=input_channel,
591
+ out_channels=output_channel,
592
+ temb_channels=blocks_time_embed_dim,
593
+ add_downsample=not is_final_block,
594
+ resnet_eps=norm_eps,
595
+ resnet_act_fn=act_fn,
596
+ resnet_groups=norm_num_groups,
597
+ cross_attention_dim=cross_attention_dim[i],
598
+ num_attention_heads=num_attention_heads[i],
599
+ downsample_padding=downsample_padding,
600
+ dual_cross_attention=dual_cross_attention,
601
+ use_linear_projection=use_linear_projection,
602
+ only_cross_attention=only_cross_attention[i],
603
+ upcast_attention=upcast_attention,
604
+ resnet_time_scale_shift=resnet_time_scale_shift,
605
+ attention_type=attention_type,
606
+ resnet_skip_time_act=resnet_skip_time_act,
607
+ resnet_out_scale_factor=resnet_out_scale_factor,
608
+ cross_attention_norm=cross_attention_norm,
609
+ attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
610
+ dropout=dropout,
611
+ )
612
+ self.down_blocks.append(down_block)
613
+
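# Illustrative note, not part of the original file: with the default
# block_out_channels = (320, 640, 1280, 1280), the loop above builds the four
# down blocks with (in_channels -> out_channels) of 320->320, 320->640,
# 640->1280 and 1280->1280, and only the last block (i == 3) omits its
# downsampler because is_final_block is True.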
614
+ # mid
615
+ if mid_block_type == "UNetMidBlock2DCrossAttn":
616
+ self.mid_block = UNetMidBlock2DCrossAttn(
617
+ transformer_layers_per_block=transformer_layers_per_block[-1],
618
+ in_channels=block_out_channels[-1],
619
+ temb_channels=blocks_time_embed_dim,
620
+ dropout=dropout,
621
+ resnet_eps=norm_eps,
622
+ resnet_act_fn=act_fn,
623
+ output_scale_factor=mid_block_scale_factor,
624
+ resnet_time_scale_shift=resnet_time_scale_shift,
625
+ cross_attention_dim=cross_attention_dim[-1],
626
+ num_attention_heads=num_attention_heads[-1],
627
+ resnet_groups=norm_num_groups,
628
+ dual_cross_attention=dual_cross_attention,
629
+ use_linear_projection=use_linear_projection,
630
+ upcast_attention=upcast_attention,
631
+ attention_type=attention_type,
632
+ )
633
+ elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn":
634
+ self.mid_block = UNetMidBlock2DSimpleCrossAttn(
635
+ in_channels=block_out_channels[-1],
636
+ temb_channels=blocks_time_embed_dim,
637
+ dropout=dropout,
638
+ resnet_eps=norm_eps,
639
+ resnet_act_fn=act_fn,
640
+ output_scale_factor=mid_block_scale_factor,
641
+ cross_attention_dim=cross_attention_dim[-1],
642
+ attention_head_dim=attention_head_dim[-1],
643
+ resnet_groups=norm_num_groups,
644
+ resnet_time_scale_shift=resnet_time_scale_shift,
645
+ skip_time_act=resnet_skip_time_act,
646
+ only_cross_attention=mid_block_only_cross_attention,
647
+ cross_attention_norm=cross_attention_norm,
648
+ )
649
+ elif mid_block_type == "UNetMidBlock2D":
650
+ self.mid_block = UNetMidBlock2D(
651
+ in_channels=block_out_channels[-1],
652
+ temb_channels=blocks_time_embed_dim,
653
+ dropout=dropout,
654
+ num_layers=0,
655
+ resnet_eps=norm_eps,
656
+ resnet_act_fn=act_fn,
657
+ output_scale_factor=mid_block_scale_factor,
658
+ resnet_groups=norm_num_groups,
659
+ resnet_time_scale_shift=resnet_time_scale_shift,
660
+ add_attention=False,
661
+ )
662
+ elif mid_block_type is None:
663
+ self.mid_block = None
664
+ else:
665
+ raise ValueError(f"unknown mid_block_type : {mid_block_type}")
666
+
667
+ # count how many layers upsample the images
668
+ self.num_upsamplers = 0
669
+
670
+ # up
671
+ reversed_block_out_channels = list(reversed(block_out_channels))
672
+ reversed_num_attention_heads = list(reversed(num_attention_heads))
673
+ reversed_layers_per_block = list(reversed(layers_per_block))
674
+ reversed_cross_attention_dim = list(reversed(cross_attention_dim))
675
+ reversed_transformer_layers_per_block = (
676
+ list(reversed(transformer_layers_per_block))
677
+ if reverse_transformer_layers_per_block is None
678
+ else reverse_transformer_layers_per_block
679
+ )
680
+ only_cross_attention = list(reversed(only_cross_attention))
681
+
682
+ output_channel = reversed_block_out_channels[0]
683
+ for i, up_block_type in enumerate(up_block_types):
684
+ is_final_block = i == len(block_out_channels) - 1
685
+
686
+ prev_output_channel = output_channel
687
+ output_channel = reversed_block_out_channels[i]
688
+ input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
689
+
690
+ # add upsample block for all BUT final layer
691
+ if not is_final_block:
692
+ add_upsample = True
693
+ self.num_upsamplers += 1
694
+ else:
695
+ add_upsample = False
696
+
697
+ up_block = get_up_block(
698
+ up_block_type,
699
+ num_layers=reversed_layers_per_block[i] + 1,
700
+ transformer_layers_per_block=reversed_transformer_layers_per_block[i],
701
+ in_channels=input_channel,
702
+ out_channels=output_channel,
703
+ prev_output_channel=prev_output_channel,
704
+ temb_channels=blocks_time_embed_dim,
705
+ add_upsample=add_upsample,
706
+ resnet_eps=norm_eps,
707
+ resnet_act_fn=act_fn,
708
+ resolution_idx=i,
709
+ resnet_groups=norm_num_groups,
710
+ cross_attention_dim=reversed_cross_attention_dim[i],
711
+ num_attention_heads=reversed_num_attention_heads[i],
712
+ dual_cross_attention=dual_cross_attention,
713
+ use_linear_projection=use_linear_projection,
714
+ only_cross_attention=only_cross_attention[i],
715
+ upcast_attention=upcast_attention,
716
+ resnet_time_scale_shift=resnet_time_scale_shift,
717
+ attention_type=attention_type,
718
+ resnet_skip_time_act=resnet_skip_time_act,
719
+ resnet_out_scale_factor=resnet_out_scale_factor,
720
+ cross_attention_norm=cross_attention_norm,
721
+ attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
722
+ dropout=dropout,
723
+ )
724
+ self.up_blocks.append(up_block)
725
+ prev_output_channel = output_channel
726
+
727
+ # out
728
+ if norm_num_groups is not None:
729
+ self.conv_norm_out = nn.GroupNorm(
730
+ num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps
731
+ )
732
+
733
+ self.conv_act = get_activation(act_fn)
734
+
735
+ else:
736
+ self.conv_norm_out = None
737
+ self.conv_act = None
738
+
739
+ conv_out_padding = (conv_out_kernel - 1) // 2
740
+ self.conv_out = nn.Conv2d(
741
+ block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
742
+ )
743
+
744
+ if attention_type in ["gated", "gated-text-image"]:
745
+ positive_len = 768
746
+ if isinstance(cross_attention_dim, int):
747
+ positive_len = cross_attention_dim
748
+ elif isinstance(cross_attention_dim, tuple) or isinstance(cross_attention_dim, list):
749
+ positive_len = cross_attention_dim[0]
750
+
751
+ feature_type = "text-only" if attention_type == "gated" else "text-image"
752
+ self.position_net = PositionNet(
753
+ positive_len=positive_len, out_dim=cross_attention_dim, feature_type=feature_type
754
+ )
755
+
756
+ @property
757
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
758
+ r"""
759
+ Returns:
760
+ `dict` of attention processors: A dictionary containing all attention processors used in the model,
761
+ indexed by its weight name.
762
+ """
763
+ # set recursively
764
+ processors = {}
765
+
766
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
767
+ if hasattr(module, "get_processor"):
768
+ processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
769
+
770
+ for sub_name, child in module.named_children():
771
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
772
+
773
+ return processors
774
+
775
+ for name, module in self.named_children():
776
+ fn_recursive_add_processors(name, module, processors)
777
+
778
+ return processors
779
+
780
+ def set_attn_processor(
781
+ self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
782
+ ):
783
+ r"""
784
+ Sets the attention processor to use to compute attention.
785
+
786
+ Parameters:
787
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
788
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
789
+ for **all** `Attention` layers.
790
+
791
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
792
+ processor. This is strongly recommended when setting trainable attention processors.
793
+
794
+ """
795
+ count = len(self.attn_processors.keys())
796
+
797
+ if isinstance(processor, dict) and len(processor) != count:
798
+ raise ValueError(
799
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
800
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
801
+ )
802
+
803
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
804
+ if hasattr(module, "set_processor"):
805
+ if not isinstance(processor, dict):
806
+ module.set_processor(processor, _remove_lora=_remove_lora)
807
+ else:
808
+ module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
809
+
810
+ for sub_name, child in module.named_children():
811
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
812
+
813
+ for name, module in self.named_children():
814
+ fn_recursive_attn_processor(name, module, processor)
815
+
816
+ def set_default_attn_processor(self):
817
+ """
818
+ Disables custom attention processors and sets the default attention implementation.
819
+ """
820
+ if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
821
+ processor = AttnAddedKVProcessor()
822
+ elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
823
+ processor = AttnProcessor()
824
+ else:
825
+ raise ValueError(
826
+ f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
827
+ )
828
+
829
+ self.set_attn_processor(processor, _remove_lora=True)
830
+
831
+ def set_attention_slice(self, slice_size):
832
+ r"""
833
+ Enable sliced attention computation.
834
+
835
+ When this option is enabled, the attention module splits the input tensor in slices to compute attention in
836
+ several steps. This is useful for saving some memory in exchange for a small decrease in speed.
837
+
838
+ Args:
839
+ slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
840
+ When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
841
+ `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
842
+ provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
843
+ must be a multiple of `slice_size`.
844
+ """
845
+ sliceable_head_dims = []
846
+
847
+ def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
848
+ if hasattr(module, "set_attention_slice"):
849
+ sliceable_head_dims.append(module.sliceable_head_dim)
850
+
851
+ for child in module.children():
852
+ fn_recursive_retrieve_sliceable_dims(child)
853
+
854
+ # retrieve number of attention layers
855
+ for module in self.children():
856
+ fn_recursive_retrieve_sliceable_dims(module)
857
+
858
+ num_sliceable_layers = len(sliceable_head_dims)
859
+
860
+ if slice_size == "auto":
861
+ # half the attention head size is usually a good trade-off between
862
+ # speed and memory
863
+ slice_size = [dim // 2 for dim in sliceable_head_dims]
864
+ elif slice_size == "max":
865
+ # make smallest slice possible
866
+ slice_size = num_sliceable_layers * [1]
867
+
868
+ slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
869
+
870
+ if len(slice_size) != len(sliceable_head_dims):
871
+ raise ValueError(
872
+ f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
873
+ f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
874
+ )
875
+
876
+ for i in range(len(slice_size)):
877
+ size = slice_size[i]
878
+ dim = sliceable_head_dims[i]
879
+ if size is not None and size > dim:
880
+ raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
881
+
882
+ # Recursively walk through all the children.
883
+ # Any children which exposes the set_attention_slice method
884
+ # gets the message
885
+ def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
886
+ if hasattr(module, "set_attention_slice"):
887
+ module.set_attention_slice(slice_size.pop())
888
+
889
+ for child in module.children():
890
+ fn_recursive_set_attention_slice(child, slice_size)
891
+
892
+ reversed_slice_size = list(reversed(slice_size))
893
+ for module in self.children():
894
+ fn_recursive_set_attention_slice(module, reversed_slice_size)
895
+
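# Illustrative sketch, not part of the original file: with the default
# attention_head_dim = 8, every sliceable attention module reports a head dim
# of 8, so slice_size="auto" computes attention in slices of size 4 (dim // 2)
# and slice_size="max" uses slices of size 1. Assuming `unet` is an instance
# of this class, a typical call would be:
#     unet.set_attention_slice("auto")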
896
+ def _set_gradient_checkpointing(self, module, value=False):
897
+ if hasattr(module, "gradient_checkpointing"):
898
+ module.gradient_checkpointing = value
899
+
900
+ def enable_freeu(self, s1, s2, b1, b2):
901
+ r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497.
902
+
903
+ The suffixes after the scaling factors represent the stage blocks where they are being applied.
904
+
905
+ Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
906
+ are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
907
+
908
+ Args:
909
+ s1 (`float`):
910
+ Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
911
+ mitigate the "oversmoothing effect" in the enhanced denoising process.
912
+ s2 (`float`):
913
+ Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
914
+ mitigate the "oversmoothing effect" in the enhanced denoising process.
915
+ b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
916
+ b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
917
+ """
918
+ for i, upsample_block in enumerate(self.up_blocks):
919
+ setattr(upsample_block, "s1", s1)
920
+ setattr(upsample_block, "s2", s2)
921
+ setattr(upsample_block, "b1", b1)
922
+ setattr(upsample_block, "b2", b2)
923
+
924
+ def disable_freeu(self):
925
+ """Disables the FreeU mechanism."""
926
+ freeu_keys = {"s1", "s2", "b1", "b2"}
927
+ for i, upsample_block in enumerate(self.up_blocks):
928
+ for k in freeu_keys:
929
+ if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None:
930
+ setattr(upsample_block, k, None)
931
+
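# Illustrative sketch, not part of the original file: enabling FreeU only sets
# the four scaling factors on the up blocks. The values below are roughly the
# ones suggested in the FreeU repository for Stable Diffusion v1.x and are an
# assumption here, not values fixed by this model:
#     unet.enable_freeu(s1=0.9, s2=0.2, b1=1.5, b2=1.6)
#     ...
#     unet.disable_freeu()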
932
+ def fuse_qkv_projections(self):
933
+ """
934
+ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
935
+ key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
936
+
937
+ <Tip warning={true}>
938
+
939
+ This API is 🧪 experimental.
940
+
941
+ </Tip>
942
+ """
943
+ self.original_attn_processors = None
944
+
945
+ for _, attn_processor in self.attn_processors.items():
946
+ if "Added" in str(attn_processor.__class__.__name__):
947
+ raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
948
+
949
+ self.original_attn_processors = self.attn_processors
950
+
951
+ for module in self.modules():
952
+ if isinstance(module, Attention):
953
+ module.fuse_projections(fuse=True)
954
+
955
+ def unfuse_qkv_projections(self):
956
+ """Disables the fused QKV projection if enabled.
957
+
958
+ <Tip warning={true}>
959
+
960
+ This API is 🧪 experimental.
961
+
962
+ </Tip>
963
+
964
+ """
965
+ if self.original_attn_processors is not None:
966
+ self.set_attn_processor(self.original_attn_processors)
967
+
968
+ def forward(
969
+ self,
970
+ sample: torch.FloatTensor,
971
+ timestep: Union[torch.Tensor, float, int],
972
+ encoder_hidden_states: torch.Tensor,
973
+ class_labels: Optional[torch.Tensor] = None,
974
+ timestep_cond: Optional[torch.Tensor] = None,
975
+ attention_mask: Optional[torch.Tensor] = None,
976
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
977
+ added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
978
+ down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
979
+ mid_block_additional_residual: Optional[torch.Tensor] = None,
980
+ down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
981
+ encoder_attention_mask: Optional[torch.Tensor] = None,
982
+ return_dict: bool = True,
983
+ ) -> Union[UNet2DConditionOutput, Tuple]:
984
+ r"""
985
+ The [`UNet2DConditionModel`] forward method.
986
+
987
+ Args:
988
+ sample (`torch.FloatTensor`):
989
+ The noisy input tensor with the following shape `(batch, channel, height, width)`.
990
+ timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
991
+ encoder_hidden_states (`torch.FloatTensor`):
992
+ The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
993
+ class_labels (`torch.Tensor`, *optional*, defaults to `None`):
994
+ Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
995
+ timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`):
996
+ Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
997
+ through the `self.time_embedding` layer to obtain the timestep embeddings.
998
+ attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
999
+ An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
1000
+ is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
1001
+ negative values to the attention scores corresponding to "discard" tokens.
1002
+ cross_attention_kwargs (`dict`, *optional*):
1003
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
1004
+ `self.processor` in
1005
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
1006
+ added_cond_kwargs: (`dict`, *optional*):
1007
+ A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
1008
+ are passed along to the UNet blocks.
1009
+ down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*):
1010
+ A tuple of tensors that if specified are added to the residuals of down unet blocks.
1011
+ mid_block_additional_residual: (`torch.Tensor`, *optional*):
1012
+ A tensor that if specified is added to the residual of the middle unet block.
1013
+ encoder_attention_mask (`torch.Tensor`):
1014
+ A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
1015
+ `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
1016
+ which adds large negative values to the attention scores corresponding to "discard" tokens.
1017
+ return_dict (`bool`, *optional*, defaults to `True`):
1018
+ Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
1019
+ tuple.
1020
+ cross_attention_kwargs (`dict`, *optional*):
1021
+ A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
1022
+ added_cond_kwargs: (`dict`, *optional*):
1023
+ A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
1024
+ are passed along to the UNet blocks.
1025
+ down_block_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
1026
+ additional residuals to be added to UNet long skip connections from down blocks to up blocks for
1027
+ example from ControlNet side model(s)
1028
+ mid_block_additional_residual (`torch.Tensor`, *optional*):
1029
+ additional residual to be added to UNet mid block output, for example from ControlNet side model
1030
+ down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
1031
+ additional residuals to be added within UNet down blocks, for example from T2I-Adapter side model(s)
1032
+
1033
+ Returns:
1034
+ [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
1035
+ If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise
1036
+ a `tuple` is returned where the first element is the sample tensor.
1037
+ """
1038
+ # By default samples have to be at least a multiple of the overall upsampling factor.
1039
+ # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
1040
+ # However, the upsampling interpolation output size can be forced to fit any upsampling size
1041
+ # on the fly if necessary.
1042
+ default_overall_up_factor = 2**self.num_upsamplers
1043
+
1044
+ # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
1045
+ forward_upsample_size = False
1046
+ upsample_size = None
1047
+
1048
+ for dim in sample.shape[-2:]:
1049
+ if dim % default_overall_up_factor != 0:
1050
+ # Forward upsample size to force interpolation output size.
1051
+ forward_upsample_size = True
1052
+ break
1053
+
1054
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
1055
+ # expects mask of shape:
1056
+ # [batch, key_tokens]
1057
+ # adds singleton query_tokens dimension:
1058
+ # [batch, 1, key_tokens]
1059
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
1060
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
1061
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
1062
+ if attention_mask is not None:
1063
+ # assume that mask is expressed as:
1064
+ # (1 = keep, 0 = discard)
1065
+ # convert mask into a bias that can be added to attention scores:
1066
+ # (keep = +0, discard = -10000.0)
1067
+ attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
1068
+ attention_mask = attention_mask.unsqueeze(1)
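# Worked example (illustrative comment only): a keep/discard mask of
# [1, 1, 0] over three key tokens becomes the additive bias
# [0.0, 0.0, -10000.0] with shape [batch, 1, key_tokens], so discarded
# tokens receive a large negative score before the attention softmax.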
1069
+
1070
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
1071
+ if encoder_attention_mask is not None:
1072
+ encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
1073
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
1074
+
1075
+ # 0. center input if necessary
1076
+ if self.config.center_input_sample:
1077
+ sample = 2 * sample - 1.0
1078
+
1079
+ # 1. time
1080
+ timesteps = timestep
1081
+ if not torch.is_tensor(timesteps):
1082
+ # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
1083
+ # This would be a good case for the `match` statement (Python 3.10+)
1084
+ is_mps = sample.device.type == "mps"
1085
+ if isinstance(timestep, float):
1086
+ dtype = torch.float32 if is_mps else torch.float64
1087
+ else:
1088
+ dtype = torch.int32 if is_mps else torch.int64
1089
+ timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
1090
+ elif len(timesteps.shape) == 0:
1091
+ timesteps = timesteps[None].to(sample.device)
1092
+
1093
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
1094
+ timesteps = timesteps.expand(sample.shape[0])
1095
+
1096
+ t_emb = self.time_proj(timesteps)
1097
+
1098
+ # `Timesteps` does not contain any weights and will always return f32 tensors
1099
+ # but time_embedding might actually be running in fp16. so we need to cast here.
1100
+ # there might be better ways to encapsulate this.
1101
+ t_emb = t_emb.to(dtype=sample.dtype)
1102
+
1103
+ emb = self.time_embedding(t_emb, timestep_cond)
1104
+ aug_emb = None
1105
+
1106
+ if self.class_embedding is not None:
1107
+ if class_labels is None:
1108
+ raise ValueError("class_labels should be provided when num_class_embeds > 0")
1109
+
1110
+ if self.config.class_embed_type == "timestep":
1111
+ class_labels = self.time_proj(class_labels)
1112
+
1113
+ # `Timesteps` does not contain any weights and will always return f32 tensors
1114
+ # there might be better ways to encapsulate this.
1115
+ class_labels = class_labels.to(dtype=sample.dtype)
1116
+
1117
+ class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)
1118
+
1119
+ if self.config.class_embeddings_concat:
1120
+ emb = torch.cat([emb, class_emb], dim=-1)
1121
+ else:
1122
+ emb = emb + class_emb
1123
+
1124
+ if self.config.addition_embed_type == "text":
1125
+ aug_emb = self.add_embedding(encoder_hidden_states)
1126
+ elif self.config.addition_embed_type == "text_image":
1127
+ # Kandinsky 2.1 - style
1128
+ if "image_embeds" not in added_cond_kwargs:
1129
+ raise ValueError(
1130
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
1131
+ )
1132
+
1133
+ image_embs = added_cond_kwargs.get("image_embeds")
1134
+ text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
1135
+ aug_emb = self.add_embedding(text_embs, image_embs)
1136
+ elif self.config.addition_embed_type == "text_time":
1137
+ # SDXL - style
1138
+ if "text_embeds" not in added_cond_kwargs:
1139
+ raise ValueError(
1140
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
1141
+ )
1142
+ text_embeds = added_cond_kwargs.get("text_embeds")
1143
+ if "time_ids" not in added_cond_kwargs:
1144
+ raise ValueError(
1145
+ f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
1146
+ )
1147
+ time_ids = added_cond_kwargs.get("time_ids")
1148
+ time_embeds = self.add_time_proj(time_ids.flatten())
1149
+ time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
1150
+ add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
1151
+ add_embeds = add_embeds.to(emb.dtype)
1152
+ aug_emb = self.add_embedding(add_embeds)
1153
+ elif self.config.addition_embed_type == "image":
1154
+ # Kandinsky 2.2 - style
1155
+ if "image_embeds" not in added_cond_kwargs:
1156
+ raise ValueError(
1157
+ f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
1158
+ )
1159
+ image_embs = added_cond_kwargs.get("image_embeds")
1160
+ aug_emb = self.add_embedding(image_embs)
1161
+ elif self.config.addition_embed_type == "image_hint":
1162
+ # Kandinsky 2.2 - style
1163
+ if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
1164
+ raise ValueError(
1165
+ f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
1166
+ )
1167
+ image_embs = added_cond_kwargs.get("image_embeds")
1168
+ hint = added_cond_kwargs.get("hint")
1169
+ aug_emb, hint = self.add_embedding(image_embs, hint)
1170
+ sample = torch.cat([sample, hint], dim=1)
1171
+
1172
+ emb = emb + aug_emb if aug_emb is not None else emb
1173
+
1174
+ if self.time_embed_act is not None:
1175
+ emb = self.time_embed_act(emb)
1176
+
1177
+ if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
1178
+ encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
1179
+ elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
1180
+ # Kadinsky 2.1 - style
1181
+ if "image_embeds" not in added_cond_kwargs:
1182
+ raise ValueError(
1183
+ f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
1184
+ )
1185
+
1186
+ image_embeds = added_cond_kwargs.get("image_embeds")
1187
+ encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
1188
+ elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj":
1189
+ # Kandinsky 2.2 - style
1190
+ if "image_embeds" not in added_cond_kwargs:
1191
+ raise ValueError(
1192
+ f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
1193
+ )
1194
+ image_embeds = added_cond_kwargs.get("image_embeds")
1195
+ encoder_hidden_states = self.encoder_hid_proj(image_embeds)
1196
+ elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj":
1197
+ if "image_embeds" not in added_cond_kwargs:
1198
+ raise ValueError(
1199
+ f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`"
1200
+ )
1201
+ image_embeds = added_cond_kwargs.get("image_embeds")
1202
+ image_embeds = self.encoder_hid_proj(image_embeds).to(encoder_hidden_states.dtype)
1203
+ encoder_hidden_states = torch.cat([encoder_hidden_states, image_embeds], dim=1)
1204
+
1205
+ # 2. pre-process
1206
+ sample = self.conv_in(sample)
1207
+
1208
+ # 2.5 GLIGEN position net
1209
+ if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None:
1210
+ cross_attention_kwargs = cross_attention_kwargs.copy()
1211
+ gligen_args = cross_attention_kwargs.pop("gligen")
1212
+ cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}
1213
+
1214
+ # 3. down
1215
+ lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
1216
+ if USE_PEFT_BACKEND:
1217
+ # weight the lora layers by setting `lora_scale` for each PEFT layer
1218
+ scale_lora_layers(self, lora_scale)
1219
+
1220
+ is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
1221
+ # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets
1222
+ is_adapter = down_intrablock_additional_residuals is not None
1223
+ # maintain backward compatibility for legacy usage, where
1224
+ # T2I-Adapter and ControlNet both use down_block_additional_residuals arg
1225
+ # but can only use one or the other
1226
+ if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None:
1227
+ deprecate(
1228
+ "T2I should not use down_block_additional_residuals",
1229
+ "1.3.0",
1230
+ "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \
1231
+ and will be removed in diffusers 1.3.0. `down_block_additional_residuals` should only be used \
1232
+ for ControlNet. Please make sure to use `down_intrablock_additional_residuals` instead. ",
1233
+ standard_warn=False,
1234
+ )
1235
+ down_intrablock_additional_residuals = down_block_additional_residuals
1236
+ is_adapter = True
1237
+
1238
+ down_block_res_samples = (sample,)
1239
+ for downsample_block in self.down_blocks:
1240
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
1241
+ # For t2i-adapter CrossAttnDownBlock2D
1242
+ additional_residuals = {}
1243
+ if is_adapter and len(down_intrablock_additional_residuals) > 0:
1244
+ additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0)
1245
+
1246
+ sample, res_samples = downsample_block(
1247
+ hidden_states=sample,
1248
+ temb=emb,
1249
+ encoder_hidden_states=encoder_hidden_states,
1250
+ attention_mask=attention_mask,
1251
+ cross_attention_kwargs=cross_attention_kwargs,
1252
+ encoder_attention_mask=encoder_attention_mask,
1253
+ **additional_residuals,
1254
+ )
1255
+ else:
1256
+ sample, res_samples = downsample_block(hidden_states=sample, temb=emb, scale=lora_scale)
1257
+ if is_adapter and len(down_intrablock_additional_residuals) > 0:
1258
+ sample += down_intrablock_additional_residuals.pop(0)
1259
+
1260
+ down_block_res_samples += res_samples
1261
+
1262
+ if is_controlnet:
1263
+ new_down_block_res_samples = ()
1264
+
1265
+ for down_block_res_sample, down_block_additional_residual in zip(
1266
+ down_block_res_samples, down_block_additional_residuals
1267
+ ):
1268
+ down_block_res_sample = down_block_res_sample + down_block_additional_residual
1269
+ new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
1270
+
1271
+ down_block_res_samples = new_down_block_res_samples
1272
+
1273
+ # 4. mid
1274
+ if self.mid_block is not None:
1275
+ if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
1276
+ sample = self.mid_block(
1277
+ sample,
1278
+ emb,
1279
+ encoder_hidden_states=encoder_hidden_states,
1280
+ attention_mask=attention_mask,
1281
+ cross_attention_kwargs=cross_attention_kwargs,
1282
+ encoder_attention_mask=encoder_attention_mask,
1283
+ )
1284
+ else:
1285
+ sample = self.mid_block(sample, emb)
1286
+
1287
+ # To support T2I-Adapter-XL
1288
+ if (
1289
+ is_adapter
1290
+ and len(down_intrablock_additional_residuals) > 0
1291
+ and sample.shape == down_intrablock_additional_residuals[0].shape
1292
+ ):
1293
+ sample += down_intrablock_additional_residuals.pop(0)
1294
+
1295
+ if is_controlnet:
1296
+ sample = sample + mid_block_additional_residual
1297
+
1298
+ # 5. up
1299
+ for i, upsample_block in enumerate(self.up_blocks):
1300
+ is_final_block = i == len(self.up_blocks) - 1
1301
+
1302
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
1303
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
1304
+
1305
+ # if we have not reached the final block and need to forward the
1306
+ # upsample size, we do it here
1307
+ if not is_final_block and forward_upsample_size:
1308
+ upsample_size = down_block_res_samples[-1].shape[2:]
1309
+
1310
+ if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
1311
+ sample = upsample_block(
1312
+ hidden_states=sample,
1313
+ temb=emb,
1314
+ res_hidden_states_tuple=res_samples,
1315
+ encoder_hidden_states=encoder_hidden_states,
1316
+ cross_attention_kwargs=cross_attention_kwargs,
1317
+ upsample_size=upsample_size,
1318
+ attention_mask=attention_mask,
1319
+ encoder_attention_mask=encoder_attention_mask,
1320
+ )
1321
+ else:
1322
+ sample = upsample_block(
1323
+ hidden_states=sample,
1324
+ temb=emb,
1325
+ res_hidden_states_tuple=res_samples,
1326
+ upsample_size=upsample_size,
1327
+ scale=lora_scale,
1328
+ )
1329
+
1330
+ # 6. post-process
1331
+ if self.conv_norm_out:
1332
+ sample = self.conv_norm_out(sample)
1333
+ sample = self.conv_act(sample)
1334
+ sample = self.conv_out(sample)
1335
+
1336
+ if USE_PEFT_BACKEND:
1337
+ # remove `lora_scale` from each PEFT layer
1338
+ unscale_lora_layers(self, lora_scale)
1339
+
1340
+ if not return_dict:
1341
+ return (sample,)
1342
+
1343
+ return UNet2DConditionOutput(sample=sample)
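A minimal usage sketch for the UNet defined above (an illustration, not part of the repository): it assumes the default configuration from `__init__` (4 latent channels, cross_attention_dim=1280) and runs a single denoising forward pass on random tensors.

    import torch

    unet = UNet2DConditionModel()                      # default config defined above
    sample = torch.randn(1, 4, 64, 64)                 # (batch, channel, height, width) noisy latents
    timestep = torch.tensor([999])                     # current diffusion timestep
    encoder_hidden_states = torch.randn(1, 77, 1280)   # (batch, sequence_length, cross_attention_dim)

    with torch.no_grad():
        out = unet(sample, timestep, encoder_hidden_states)
    print(out.sample.shape)                            # torch.Size([1, 4, 64, 64])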
vae/autoencoder_kl.py ADDED
@@ -0,0 +1,559 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Dict, Optional, Tuple, Union
15
+
16
+ import torch
17
+ import torch.nn as nn
18
+ from peft import LoraConfig
19
+
20
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
21
+ from diffusers.loaders import FromOriginalVAEMixin
22
+ from diffusers.utils.accelerate_utils import apply_forward_hook
23
+ from diffusers.models.attention_processor import (
24
+ ADDED_KV_ATTENTION_PROCESSORS,
25
+ CROSS_ATTENTION_PROCESSORS,
26
+ Attention,
27
+ AttentionProcessor,
28
+ AttnAddedKVProcessor,
29
+ AttnProcessor,
30
+ )
31
+ from diffusers.models.modeling_outputs import AutoencoderKLOutput
32
+ from diffusers.models.modeling_utils import ModelMixin
33
+ from diffusers.models.autoencoders.vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder
34
+
35
+
36
+ def my_vae_encoder_fwd(self, sample):
37
+ sample = self.conv_in(sample)
38
+ l_blocks = []
39
+ # down
40
+ for down_block in self.down_blocks:
41
+ l_blocks.append(sample)
42
+ sample = down_block(sample)
43
+ # middle
44
+ sample = self.mid_block(sample)
45
+ sample = self.conv_norm_out(sample)
46
+ sample = self.conv_act(sample)
47
+ sample = self.conv_out(sample)
48
+ self.current_down_blocks = l_blocks
49
+ return sample
50
+
51
+
52
+ def my_vae_decoder_fwd(self, sample, latent_embeds=None):
53
+ sample = self.conv_in(sample)
54
+ upscale_dtype = next(iter(self.up_blocks.parameters())).dtype
55
+ # middle
56
+ sample = self.mid_block(sample, latent_embeds)
57
+ sample = sample.to(upscale_dtype)
58
+ if not self.ignore_skip:
59
+ skip_convs = [self.skip_conv_1, self.skip_conv_2, self.skip_conv_3, self.skip_conv_4]
60
+ # up
61
+ for idx, up_block in enumerate(self.up_blocks):
62
+ skip_in = skip_convs[idx](self.incoming_skip_acts[::-1][idx] * self.gamma)
63
+ # add skip
64
+ sample = sample + skip_in
65
+ sample = up_block(sample, latent_embeds)
66
+ else:
67
+ for idx, up_block in enumerate(self.up_blocks):
68
+ sample = up_block(sample, latent_embeds)
69
+ # post-process
70
+ if latent_embeds is None:
71
+ sample = self.conv_norm_out(sample)
72
+ else:
73
+ sample = self.conv_norm_out(sample, latent_embeds)
74
+ sample = self.conv_act(sample)
75
+ sample = self.conv_out(sample)
76
+ return sample
77
+
78
+
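# Sketch of how the patched encoder/decoder above are expected to cooperate.
# This wiring is an assumption based on the attributes they read and write
# (`current_down_blocks`, `incoming_skip_acts`); the actual hand-off happens in
# the calling pipeline, which is not part of this file:
#     posterior = vae.encode(image).latent_dist                       # caches per-down-block activations
#     vae.decoder.incoming_skip_acts = vae.encoder.current_down_blocks
#     recon = vae.decode(posterior.sample())                          # decoder adds skip_conv_i(act * gamma)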
79
+ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
80
+ r"""
81
+ A VAE model with KL loss for encoding images into latents and decoding latent representations into images.
82
+
83
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
84
+ for all models (such as downloading or saving).
85
+
86
+ Parameters:
87
+ in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
88
+ out_channels (int, *optional*, defaults to 3): Number of channels in the output.
89
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
90
+ Tuple of downsample block types.
91
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
92
+ Tuple of upsample block types.
93
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
94
+ Tuple of block output channels.
95
+ act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
96
+ latent_channels (`int`, *optional*, defaults to 4): Number of channels in the latent space.
97
+ sample_size (`int`, *optional*, defaults to `32`): Sample input size.
98
+ scaling_factor (`float`, *optional*, defaults to 0.18215):
99
+ The component-wise standard deviation of the trained latent space computed using the first batch of the
100
+ training set. This is used to scale the latent space to have unit variance when training the diffusion
101
+ model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
102
+ diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
103
+ / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
104
+ Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
105
+ force_upcast (`bool`, *optional*, defaults to `True`):
106
+ If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE
107
+ can be fine-tuned / trained to a lower range without losing too much precision in which case
108
+ `force_upcast` can be set to `False` - see: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix
109
+ """
110
+
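# Worked example of the scaling convention described above (illustrative
# comment only): with scaling_factor = 0.18215, latents handed to the
# diffusion model are z_scaled = z * 0.18215, and decoding first undoes it
# with z = z_scaled / 0.18215.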
111
+ _supports_gradient_checkpointing = True
112
+
113
+ @register_to_config
114
+ def __init__(
115
+ self,
116
+ in_channels: int = 3,
117
+ out_channels: int = 3,
118
+ down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
119
+ up_block_types: Tuple[str] = ("UpDecoderBlock2D",),
120
+ block_out_channels: Tuple[int] = (64,),
121
+ layers_per_block: int = 1,
122
+ act_fn: str = "silu",
123
+ latent_channels: int = 4,
124
+ norm_num_groups: int = 32,
125
+ sample_size: int = 32,
126
+ scaling_factor: float = 0.18215,
127
+ force_upcast: bool = True,
128
+ lora_rank: int = 4,
129
+ gamma: float = 1.0,
130
+ ignore_skip: bool = False,
131
+ ):
132
+ super().__init__()
133
+
134
+ # pass init params to Encoder
135
+ self.encoder = Encoder(
136
+ in_channels=in_channels,
137
+ out_channels=latent_channels,
138
+ down_block_types=down_block_types,
139
+ block_out_channels=block_out_channels,
140
+ layers_per_block=layers_per_block,
141
+ act_fn=act_fn,
142
+ norm_num_groups=norm_num_groups,
143
+ double_z=True,
144
+ )
145
+
146
+ # pass init params to Decoder
147
+ self.decoder = Decoder(
148
+ in_channels=latent_channels,
149
+ out_channels=out_channels,
150
+ up_block_types=up_block_types,
151
+ block_out_channels=block_out_channels,
152
+ layers_per_block=layers_per_block,
153
+ norm_num_groups=norm_num_groups,
154
+ act_fn=act_fn,
155
+ )
156
+
157
+ self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1)
158
+ self.post_quant_conv = nn.Conv2d(latent_channels, latent_channels, 1)
159
+
160
+ self.use_slicing = False
161
+ self.use_tiling = False
162
+
163
+ # only relevant if vae tiling is enabled
164
+ self.tile_sample_min_size = self.config.sample_size
165
+ sample_size = (
166
+ self.config.sample_size[0]
167
+ if isinstance(self.config.sample_size, (list, tuple))
168
+ else self.config.sample_size
169
+ )
170
+ self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1)))
171
+ self.tile_overlap_factor = 0.25
172
+
173
+ self.encoder.forward = my_vae_encoder_fwd.__get__(self.encoder, self.encoder.__class__)
174
+ self.decoder.forward = my_vae_decoder_fwd.__get__(self.decoder, self.decoder.__class__)
175
+ # add the skip connection convs
176
+ self.decoder.skip_conv_1 = torch.nn.Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
177
+ self.decoder.skip_conv_2 = torch.nn.Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
178
+ self.decoder.skip_conv_3 = torch.nn.Conv2d(128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False)
179
+ self.decoder.skip_conv_4 = torch.nn.Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
180
+ self.decoder.ignore_skip = ignore_skip
181
+ self.decoder.gamma = gamma
182
+
183
+ target_modules_vae = ["conv1", "conv2", "conv_in", "conv_shortcut", "conv", "conv_out",
184
+ "skip_conv_1", "skip_conv_2", "skip_conv_3", "skip_conv_4",
185
+ "to_k", "to_q", "to_v", "to_out.0",
186
+ ]
187
+ target_modules = []
188
+ for name, module in self.named_modules():
189
+ if 'decoder' in name and any(name.endswith(x) for x in target_modules_vae):
190
+ target_modules.append(name)
191
+ target_modules_vae = target_modules
192
+
193
+ vae_lora_config = LoraConfig(r=lora_rank, init_lora_weights="gaussian", target_modules=target_modules_vae)
194
+ self.add_adapter(vae_lora_config, adapter_name="vae_skip")
195
+
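# Note on the adapter above (an assumption about the surrounding framework,
# not stated in this file): `add_adapter` relies on the PEFT integration that
# diffusers' ModelMixin exposes, so only decoder modules whose names end with
# one of the suffixes collected into `target_modules_vae` (convs, the four
# skip convs, and the attention projections) receive rank-`lora_rank` LoRA
# weights under the adapter name "vae_skip".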
196
+ def _set_gradient_checkpointing(self, module, value=False):
197
+ if isinstance(module, (Encoder, Decoder)):
198
+ module.gradient_checkpointing = value
199
+
200
+ def enable_tiling(self, use_tiling: bool = True):
201
+ r"""
202
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
203
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
204
+ processing larger images.
205
+ """
206
+ self.use_tiling = use_tiling
207
+
208
+ def disable_tiling(self):
209
+ r"""
210
+ Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
211
+ decoding in one step.
212
+ """
213
+ self.enable_tiling(False)
214
+
215
+ def enable_slicing(self):
216
+ r"""
217
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
218
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
219
+ """
220
+ self.use_slicing = True
221
+
222
+ def disable_slicing(self):
223
+ r"""
224
+ Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
225
+ decoding in one step.
226
+ """
227
+ self.use_slicing = False
228
+
229
+    @property
+    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model,
+            indexed by weight name.
+        """
+        # set recursively
+        processors = {}
+
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "get_processor"):
+                processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
+
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+
+            return processors
+
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+
+        return processors
+
+    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
+    def set_attn_processor(
+        self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
+    ):
+        r"""
+        Sets the attention processor to use to compute attention.
+
+        Parameters:
+            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                for **all** `Attention` layers.
+
+                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
+                processor. This is strongly recommended when setting trainable attention processors.
+
+        """
+        count = len(self.attn_processors.keys())
+
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor, _remove_lora=_remove_lora)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
+
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+
+    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnProcessor()
+        else:
+            raise ValueError(
+                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
+            )
+
+        self.set_attn_processor(processor, _remove_lora=True)
+
+    @apply_forward_hook
+    def encode(
+        self, x: torch.FloatTensor, return_dict: bool = True
+    ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
+        """
+        Encode a batch of images into latents.
+
+        Args:
+            x (`torch.FloatTensor`): Input batch of images.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
+
+        Returns:
+            The latent representations of the encoded images. If `return_dict` is True, a
+            [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
+        """
+        if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
+            return self.tiled_encode(x, return_dict=return_dict)
+
+        if self.use_slicing and x.shape[0] > 1:
+            encoded_slices = [self.encoder(x_slice) for x_slice in x.split(1)]
+            h = torch.cat(encoded_slices)
+        else:
+            h = self.encoder(x)
+
+        moments = self.quant_conv(h)
+        posterior = DiagonalGaussianDistribution(moments)
+
+        if not return_dict:
+            return (posterior,)
+
+        return AutoencoderKLOutput(latent_dist=posterior)
+
+    def _decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
+        if self.use_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size):
+            return self.tiled_decode(z, return_dict=return_dict)
+
+        z = self.post_quant_conv(z)
+        dec = self.decoder(z)
+
+        if not return_dict:
+            return (dec,)
+
+        return DecoderOutput(sample=dec)
+
+    @apply_forward_hook
+    def decode(
+        self, z: torch.FloatTensor, return_dict: bool = True, generator=None
+    ) -> Union[DecoderOutput, torch.FloatTensor]:
+        """
+        Decode a batch of images.
+
+        Args:
+            z (`torch.FloatTensor`): Input batch of latent vectors.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.vae.DecoderOutput`] or `tuple`:
+                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
+                returned.
+
+        """
+        if self.use_slicing and z.shape[0] > 1:
+            decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
+            decoded = torch.cat(decoded_slices)
+        else:
+            decoded = self._decode(z).sample
+
+        if not return_dict:
+            return (decoded,)
+
+        return DecoderOutput(sample=decoded)
+
+    def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        # linear crossfade between the bottom rows of tile `a` and the top rows of tile `b`
+        blend_extent = min(a.shape[2], b.shape[2], blend_extent)
+        for y in range(blend_extent):
+            b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent)
+        return b
+
+    def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        # linear crossfade between the right columns of tile `a` and the left columns of tile `b`
+        blend_extent = min(a.shape[3], b.shape[3], blend_extent)
+        for x in range(blend_extent):
+            b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
+        return b
+
+    def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderKLOutput:
+        r"""Encode a batch of images using a tiled encoder.
+
+        When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
+        steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is
+        different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
+        tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
+        output, but they should be much less noticeable.
+
+        Args:
+            x (`torch.FloatTensor`): Input batch of images.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.autoencoder_kl.AutoencoderKLOutput`] or `tuple`:
+                If return_dict is True, a [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain
+                `tuple` is returned.
+        """
+        overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor))
+        blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)
+        row_limit = self.tile_latent_min_size - blend_extent
+
+        # Split the image into overlapping tiles of `tile_sample_min_size` and encode them separately.
+        rows = []
+        for i in range(0, x.shape[2], overlap_size):
+            row = []
+            for j in range(0, x.shape[3], overlap_size):
+                tile = x[:, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size]
+                tile = self.encoder(tile)
+                tile = self.quant_conv(tile)
+                row.append(tile)
+            rows.append(row)
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile
+                # to the current tile and add the current tile to the result row
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_extent)
+                result_row.append(tile[:, :, :row_limit, :row_limit])
+            result_rows.append(torch.cat(result_row, dim=3))
+
+        moments = torch.cat(result_rows, dim=2)
+        posterior = DiagonalGaussianDistribution(moments)
+
+        if not return_dict:
+            return (posterior,)
+
+        return AutoencoderKLOutput(latent_dist=posterior)
+
+    def tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
+        r"""
+        Decode a batch of images using a tiled decoder.
+
+        Args:
+            z (`torch.FloatTensor`): Input batch of latent vectors.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.vae.DecoderOutput`] or `tuple`:
+                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
+                returned.
+        """
+        overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor))
+        blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor)
+        row_limit = self.tile_sample_min_size - blend_extent
+
+        # Split z into overlapping tiles of `tile_latent_min_size` and decode them separately.
+        # The tiles have an overlap to avoid seams between tiles.
+        rows = []
+        for i in range(0, z.shape[2], overlap_size):
+            row = []
+            for j in range(0, z.shape[3], overlap_size):
+                tile = z[:, :, i : i + self.tile_latent_min_size, j : j + self.tile_latent_min_size]
+                tile = self.post_quant_conv(tile)
+                decoded = self.decoder(tile)
+                row.append(decoded)
+            rows.append(row)
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile
+                # to the current tile and add the current tile to the result row
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_extent)
+                result_row.append(tile[:, :, :row_limit, :row_limit])
+            result_rows.append(torch.cat(result_row, dim=3))
+
+        dec = torch.cat(result_rows, dim=2)
+        if not return_dict:
+            return (dec,)
+
+        return DecoderOutput(sample=dec)
+
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+        sample_posterior: bool = False,
+        return_dict: bool = True,
+        generator: Optional[torch.Generator] = None,
+    ) -> Union[DecoderOutput, torch.FloatTensor]:
+        r"""
+        Args:
+            sample (`torch.FloatTensor`): Input sample.
+            sample_posterior (`bool`, *optional*, defaults to `False`):
+                Whether to sample from the posterior.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
+        """
+        x = sample
+        posterior = self.encode(x).latent_dist
+        if sample_posterior:
+            z = posterior.sample(generator=generator)
+        else:
+            z = posterior.mode()
+        dec = self.decode(z).sample
+
+        if not return_dict:
+            return (dec,)
+
+        return DecoderOutput(sample=dec)
+
+    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
+    def fuse_qkv_projections(self):
+        """
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
+        key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+        """
+        self.original_attn_processors = None
+
+        for _, attn_processor in self.attn_processors.items():
+            if "Added" in str(attn_processor.__class__.__name__):
+                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
+
+        self.original_attn_processors = self.attn_processors
+
+        for module in self.modules():
+            if isinstance(module, Attention):
+                module.fuse_projections(fuse=True)
+
+    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
+    def unfuse_qkv_projections(self):
+        """Disables the fused QKV projection if enabled.
+
+        <Tip warning={true}>
+
+        This API is 🧪 experimental.
+
+        </Tip>
+
+        """
+        if self.original_attn_processors is not None:
+            self.set_attn_processor(self.original_attn_processors)
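
The tiled paths above cut the input into overlapping tiles, run the encoder or decoder on each tile independently, and linearly crossfade neighbouring tiles over the overlap region (`blend_v` / `blend_h`) before cropping and concatenating the results. A minimal, self-contained sketch of that crossfade, using the latent tile size implied by the `vae/config.json` below (96 = 768 / 2**3); the tensors and values here are illustrative only, not taken from the checkpoint:

    import torch

    def blend_h(a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
        # same linear crossfade as in the class above: fade out the right edge of `a`
        # while fading in the left edge of `b` over `blend_extent` columns
        blend_extent = min(a.shape[3], b.shape[3], blend_extent)
        for x in range(blend_extent):
            b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent)
        return b

    # illustrative numbers: 96x96 latent tiles (sample_size 768 over three downsamples) with a 0.25 overlap
    tile_latent_min_size = 768 // (2 ** 3)           # 96
    blend_extent = int(tile_latent_min_size * 0.25)  # 24
    left = torch.zeros(1, 4, 96, 96)
    right = torch.ones(1, 4, 96, 96)
    blended = blend_h(left, right, blend_extent)
    print(blended[0, 0, 0, :blend_extent])  # ramps from 0 towards 1 across the overlap
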
vae/config.json ADDED
@@ -0,0 +1,35 @@
+ {
+   "_class_name": "AutoencoderKL",
+   "_diffusers_version": "0.25.1",
+   "_name_or_path": "nvidia/difix_ref/vae",
+   "act_fn": "silu",
+   "block_out_channels": [
+     128,
+     256,
+     512,
+     512
+   ],
+   "down_block_types": [
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D"
+   ],
+   "force_upcast": true,
+   "gamma": 1.0,
+   "ignore_skip": false,
+   "in_channels": 3,
+   "latent_channels": 4,
+   "layers_per_block": 2,
+   "lora_rank": 4,
+   "norm_num_groups": 32,
+   "out_channels": 3,
+   "sample_size": 768,
+   "scaling_factor": 0.18215,
+   "up_block_types": [
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D"
+   ]
+ }
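
The non-standard keys in this config (`gamma`, `ignore_skip`, `lora_rank`) feed the customised constructor above, while the stock `AutoencoderKL` keys also fix the tiling geometry. A small sketch, assuming a local download of this repo, of how the tile thresholds set in `__init__` follow from these values:

    import json

    with open("vae/config.json") as f:  # assumes the repo files have been downloaded locally
        cfg = json.load(f)

    sample_size = cfg["sample_size"]                          # 768
    downsample = 2 ** (len(cfg["block_out_channels"]) - 1)    # 8 for four blocks
    tile_sample_min_size = sample_size                        # 768, threshold for tiled encoding
    tile_latent_min_size = int(sample_size / downsample)      # 96, threshold for tiled decoding
    print(tile_sample_min_size, tile_latent_min_size)         # 768 96
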
vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d3aa93824c839302d1103d72b0ea933df65206945b0e93140328562fffa6cf65
+ size 338717612
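
The weights file is stored as a Git LFS pointer, so the lines above name the object rather than containing it. A short check, assuming the safetensors file has been fetched locally (for example via `git lfs pull` or `huggingface_hub`), that the download matches the pointer's `oid` and `size`:

    import hashlib
    import os

    path = "vae/diffusion_pytorch_model.safetensors"  # local copy of the LFS object
    expected_oid = "d3aa93824c839302d1103d72b0ea933df65206945b0e93140328562fffa6cf65"
    expected_size = 338717612

    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            h.update(chunk)

    assert os.path.getsize(path) == expected_size
    assert h.hexdigest() == expected_oid
    print("weights match the LFS pointer")
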