wangfuyun committed on
Commit 522bf24 · verified · 1 Parent(s): f9f4433

Add UniRL inference code

.gitattributes CHANGED
@@ -37,3 +37,6 @@ promptrl_geneval/tokenizer.json filter=lfs diff=lfs merge=lfs -text
  promptrl_ocr/tokenizer.json filter=lfs diff=lfs merge=lfs -text
  promptrl_ps/tokenizer.json filter=lfs diff=lfs merge=lfs -text
  promptrl_edit/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ assets/edit_comparison.png filter=lfs diff=lfs merge=lfs -text
+ assets/logo.png filter=lfs diff=lfs merge=lfs -text
+ assets/t2i_comparison.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,149 @@
1
+ # Python
2
+ *.pyc
3
+ __pycache__/
4
+ *.pyo
5
+ *.pyd
6
+ .Python
7
+ env/
8
+ venv/
9
+ ENV/
10
+ env.bak/
11
+ venv.bak/
12
+
13
+ # Byte-compiled / optimized / DLL files
14
+ *.py[cod]
15
+ *$py.class
16
+
17
+ # C extensions
18
+ *.so
19
+
20
+ # Distribution / packaging
21
+ .Python
22
+ build/
23
+ dist/
24
+ downloads/
25
+ eggs/
26
+ .eggs/
27
+ lib/
28
+ lib64/
29
+ parts/
30
+ sdist/
31
+ var/
32
+ wheels/
33
+ *.egg-info/
34
+ .installed.cfg
35
+ *.egg
36
+
37
+ # PyInstaller
38
+ *.manifest
39
+ *.spec
40
+
41
+ # Installer logs
42
+ pip-log.txt
43
+ pip-delete-this-directory.txt
44
+
45
+ # Unit test / coverage reports
46
+ htmlcov/
47
+ .tox/
48
+ .nox/
49
+ .coverage
50
+ .coverage.*
51
+ .cache
52
+ nosetests.xml
53
+ coverage.xml
54
+ *.cover
55
+ *.py,cover
56
+ .hypothesis/
57
+ .pytest_cache/
58
+
59
+ # Jupyter Notebook
60
+ .ipynb_checkpoints
61
+
62
+ # IPython
63
+ profile_default/
64
+ ipython_config.py
65
+
66
+ # pyenv
67
+ .python-version
68
+
69
+ # pipenv
70
+ Pipfile.lock
71
+
72
+ # Poetry
73
+ poetry.lock
74
+
75
+ # Virtualenv
76
+ .venv
77
+ venv/
78
+ ENV/
79
+
80
+ # Spyder project settings
81
+ .spyderproject
82
+ .spyproject
83
+
84
+ # Rope project settings
85
+ .ropeproject
86
+
87
+ # mkdocs documentation
88
+ /site
89
+
90
+ # mypy
91
+ .mypy_cache/
92
+ .dmypy.json
93
+ dmypy.json
94
+
95
+ # Pyre type checker
96
+ .pyre/
97
+
98
+ # IDEs and editors
99
+ .idea/
100
+ .vscode/
101
+ *.sublime-workspace
102
+
103
+ # OS generated files
104
+ .DS_Store
105
+ Thumbs.db
106
+
107
+ # Logs
108
+ *.log
109
+ logs/
110
+ *.log.*
111
+
112
+ # Dependency directories
113
+ node_modules/
114
+ bower_components/
115
+
116
+ # Optional: Local configuration files
117
+ *.local
118
+ *.env
119
+ .env
120
+ .env.local
121
+ .env.development.local
122
+ .env.test.local
123
+ .env.production.local
124
+
125
+ # Optional: Database
126
+ *.sqlite3
127
+ *.db
128
+
129
+ # Optional: Django
130
+ *.sqlite3
131
+ migrations/
132
+ *.mo
133
+ *.pot
134
+ staticfiles/
135
+
136
+ # Optional: Flask
137
+ instance/
138
+ .webassets-cache
139
+
140
+ # Optional: Scrapy
141
+ .scrapy
142
+
143
+ outputs/
144
+
145
+ wandb/
146
+
147
+ assets/large_rl_datasets/
148
+
149
+ utils/parquet_cache/
LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [2025] [Fu-Yun Wang]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,155 @@
+ <p align="center">
+ <img src="assets/logo.png" width="30%"><br>
+ PromptRL
+ </p>
+
+ <p align="center">
+ <a href="https://arxiv.org/abs/2602.01382"><img src="https://img.shields.io/badge/arXiv-2602.01382-b31b1b.svg" alt="arXiv"></a>
+ <a href="https://g-u-n.github.io/projects/promptrl/"><img src="https://img.shields.io/badge/Project-Page-green.svg" alt="Project Page"></a>
+ <a href="https://huggingface.co/wangfuyun/PrompRL"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-blue" alt="HuggingFace"></a>
+ </p>
+
+ ## Overview
+
+ **PromptRL** is a framework that jointly trains language models (LMs) and flow-matching models (FMs) within a unified reinforcement learning loop for text-to-image generation. By incorporating LMs as adaptive prompt refiners, PromptRL addresses two critical limitations in current flow-based RL pipelines: *exploration collapse* due to insufficient generation diversity, and *prompt overfitting* where models memorize specific training formulations.
+
+
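The loop below is only a conceptual sketch of this idea, not the released training code: an LM policy rewrites the prompt, the flow-matching policy renders it, a reward model scores the result, and both policies receive the reward signal. All names (`lm_policy`, `fm_policy`, `reward_model`, `rl_update`) are placeholders.

```python
# Conceptual sketch of one joint RL step (illustrative only; not the repo's trainer).
def joint_rl_step(user_prompt, lm_policy, fm_policy, reward_model, rl_update):
    refined_prompt = lm_policy.refine(user_prompt)   # adaptive prompt refinement
    image = fm_policy.generate(refined_prompt)       # flow-matching sampling
    reward = reward_model.score(image, user_prompt)  # task reward (exact formulation is in the paper)
    rl_update(lm_policy, reward)                     # update the prompt refiner
    rl_update(fm_policy, reward)                     # update the image generator
    return refined_prompt, image, reward
```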
+ ## Installation
+
+ ```bash
+ conda env create -f environment.yml
+ conda activate unirl
+ pip install git+https://github.com/openai/CLIP.git
+ pip install git+https://github.com/huggingface/diffusers.git
+ pip install flash-attn==2.7.4.post1 --no-build-isolation
+
+ # run gen.sh for evaluation
+ # bash gen.sh
+ ```
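After installation, a quick sanity check (illustrative, not part of the repository) can confirm that the pinned GPU stack imports cleanly before running `gen.sh`:

```python
# Post-install sanity check (illustrative): verify the CUDA-enabled PyTorch build
# and the key libraries installed above are importable.
import torch
import diffusers
import transformers

print("torch", torch.__version__, "| CUDA available:", torch.cuda.is_available())
print("diffusers", diffusers.__version__)
print("transformers", transformers.__version__)

try:
    import flash_attn  # installed with --no-build-isolation above
    print("flash-attn", flash_attn.__version__)
except ImportError:
    print("flash-attn missing; attention falls back to default kernels")
```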
+
+ ## Qualitative Results
+
+ ### Text-to-Image Generation
+ <p align="center">
+ <img src="assets/t2i_comparison.png" width="85%">
+ </p>
+
+ ### Instructional Image Editing
+ <p align="center">
+ <img src="assets/edit_comparison.png" width="75%">
+ </p>
+
+
+ ## Key Results
+
+ PromptRL achieves **2× sample efficiency** compared to flow-only RL while also producing an adaptive prompt-refinement agent that improves test-time performance.
+
+ ### Summary
+
+ | Benchmark | Metric | PromptRL w/ PE | Best Baseline |
+ |:---|:---|:---:|:---:|
+ | GenEval | Avg. Score ↑ | **0.97** | 0.92 (FlowGRPO) |
+ | Aesthetic | PickScore ↑ | **24.05** | 23.63 (DiffusionNFT) |
+ | Aesthetic | HPS ↑ | **32.03** | 31.79 (DiffusionNFT) |
+ | OCR | OCR-1k ↑ | **0.98** | 0.89 (FlowGRPO) |
+ | Image Editing | EditReward Avg. ↑ | **1.43** | 1.44 (ReasonEdit-Think) |
+
+ ---
+
+ <details>
+ <summary><b>📊 GenEval Benchmark (Full Results)</b></summary>
+
+ <br>
+
+ | Model | 1 Obj. | 2 Obj. | Cnt. | Clr. | Pos. | Attr. | Avg.↑ |
+ |:---|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
+ | Show-o | 0.95 | 0.52 | 0.49 | 0.82 | 0.11 | 0.28 | 0.53 |
+ | Emu3-Gen | 0.98 | 0.71 | 0.34 | 0.81 | 0.17 | 0.21 | 0.54 |
+ | SD3 Medium | 0.98 | 0.74 | 0.63 | 0.67 | 0.34 | 0.36 | 0.62 |
+ | FLUX.1-dev | 0.98 | 0.81 | 0.74 | 0.79 | 0.22 | 0.45 | 0.66 |
+ | SD3.5 Large | 0.98 | 0.89 | 0.73 | 0.83 | 0.34 | 0.47 | 0.71 |
+ | JanusFlow | 0.97 | 0.59 | 0.45 | 0.83 | 0.53 | 0.42 | 0.63 |
+ | Janus-Pro-7B | 0.99 | 0.89 | 0.59 | 0.90 | 0.79 | 0.66 | 0.80 |
+ | HiDream | 1.00 | 0.98 | 0.79 | 0.91 | 0.60 | 0.72 | 0.83 |
+ | Seedream 3.0 | 0.99 | 0.96 | 0.91 | 0.93 | 0.47 | 0.80 | 0.84 |
+ | Qwen-Image | 0.99 | 0.92 | 0.89 | 0.88 | 0.76 | 0.77 | 0.87 |
+ | *RL-based* | | | | | | | |
+ | RePrompt | 0.98 | 0.87 | 0.77 | 0.85 | 0.62 | 0.49 | 0.76 |
+ | FlowGRPO | 1.00 | 0.99 | 0.91 | 0.89 | 0.95 | 0.80 | 0.92 |
+ | DiffusionNFT | 1.00 | 0.98 | 0.74 | 0.92 | 0.85 | 0.80 | 0.88 |
+ | PromptRL w/o PE | 1.00 | 0.96 | 0.95 | 0.95 | 0.93 | 0.85 | 0.94 |
+ | **PromptRL w/ PE** | **1.00** | **0.99** | **0.99** | **0.96** | **0.99** | **0.90** | **0.97** |
+
+ </details>
+
+ <details>
+ <summary><b>🎨 Aesthetic & OCR Metrics (Full Results)</b></summary>
+
+ <br>
+
+ | Model | P.S. | HPS | U.R. | OCR-1k | TMDB | OpenLib |
+ |:---|:---:|:---:|:---:|:---:|:---:|:---:|
+ | SD1.5 | 20.92 | 23.71 | 2.00 | 0.05 | 0.13 | 0.08 |
+ | SDXL | 22.14 | 26.67 | 2.78 | 0.13 | 0.20 | 0.09 |
+ | SD3 Medium | 22.38 | 28.56 | 3.09 | — | 0.44 | 0.33 |
+ | FLUX.1-schnell | 22.64 | 29.39 | 3.25 | 0.54 | 0.66 | 0.50 |
+ | FLUX.2-klein | 22.79 | 29.03 | 3.29 | 0.55 | 0.22 | 0.46 |
+ | Z-Image | 20.14 | 28.22 | 3.51 | 0.70 | 0.71 | 0.83 |
+ | Qwen-Image | 23.05 | 30.40 | 3.53 | 0.65 | 0.79 | 0.94 |
+ | Qwen-Image-2512 | 23.16 | 30.79 | 3.40 | 0.72 | 0.81 | 0.87 |
+ | *RL-based* | | | | | | |
+ | FlowGRPO | 23.33 | 29.80 | 3.33 | 0.89 | 0.83 | 0.73 |
+ | DiffusionNFT | 23.63 | 31.79 | 3.39 | 0.89 | 0.91 | 0.86 |
+ | PromptRL w/o PE | 24.01 | 31.79 | 3.38 | 0.97 | 0.92 | 0.95 |
+ | **PromptRL w/ PE** | **24.05** | **32.03** | **3.44** | **0.98** | **0.91** | **0.95** |
+
+ </details>
+
+ <details>
+ <summary><b>✏️ Image Editing - EditReward (Full Results)</b></summary>
+
+ <br>
+
+ | Model | Swap | Style | Add. | Attr. | Env. | Removal | Avg.↑ |
+ |:---|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
+ | InstructPix2Pix | -0.24 | 0.91 | -0.45 | 0.45 | 0.48 | -0.80 | 0.02 |
+ | MagicBrush | -0.38 | 0.36 | -0.78 | -0.80 | 0.91 | -0.85 | -0.27 |
+ | LEDITS++ | -0.81 | -0.32 | -0.30 | -0.60 | -0.37 | -0.97 | -0.60 |
+ | Qwen-Image-Edit | 1.11 | 1.14 | 0.95 | 0.90 | 1.39 | 0.61 | 1.03 |
+ | FLUX.2-klein | 1.42 | 1.73 | 1.29 | 1.42 | 1.80 | 0.32 | 1.34 |
+ | Nano Banana | 1.58 | 1.20 | 1.28 | 1.18 | 1.61 | 1.13 | 1.37 |
+ | Step1X-Edit | 1.39 | 1.58 | 1.19 | 1.34 | 1.57 | 0.22 | 1.24 |
+ | ReasonEdit | 1.51 | 1.43 | 1.19 | 1.47 | 1.58 | 1.14 | 1.40 |
+ | ReasonEdit-Think | 1.52 | 1.47 | 1.19 | 1.44 | 1.69 | 1.27 | 1.44 |
+ | FLUX.1-Kontext | 1.35 | 1.36 | 1.16 | 1.15 | 1.44 | 0.55 | 1.19 |
+ | FLUX.1-Kontext w/ PE | 1.35 | 0.97 | 1.04 | 0.48 | 1.22 | 0.65 | 1.01 |
+ | PromptRL w/o PE | 1.45 | 1.46 | 1.28 | 1.35 | 1.56 | 0.98 | 1.36 |
+ | **PromptRL w/ PE** | **1.47** | **1.43** | **1.29** | **1.39** | **1.72** | **1.24** | **1.43** |
+
+ </details>
+
+
+
+ ## Citation
+
+ ```bibtex
+ @article{wang2025promptrl,
+   title={PromptRL: Prompt Matters in RL for Flow-Based Image Generation},
+   author={Wang, Fu-Yun and Zhang, Han and Gharbi, Michael and Li, Hongsheng and Park, Taesung},
+   journal={arXiv preprint arXiv:2602.01382},
+   year={2026}
+ }
+ ```
+
+ ```bibtex
+ @article{wang2025unirl,
+   title={UniRL-Zero: Reinforcement Learning on Unified Models with Joint Language Model and Diffusion Model Experts},
+   author={Wang, Fu-Yun and Zhang, Han and Gharbi, Michael and Li, Hongsheng and Park, Taesung},
+   journal={arXiv preprint arXiv:2510.17937},
+   year={2025}
+ }
+ ```
+
+ ## Acknowledgments
+
+ This codebase builds upon [UniRL-Zero](https://github.com/G-U-N/UniRL/tree/master).
assets/edit_comparison.png ADDED

Git LFS Details

  • SHA256: 7c9ca476030f9ea93db556f9157a3b94d113deaf7e18857a1712d33f9727f6ee
  • Pointer size: 132 Bytes
  • Size of remote file: 5.64 MB
assets/logo.png ADDED

Git LFS Details

  • SHA256: 9feacdd6b0ae47cc59b2bded2d663e66014fd8c642fbf945a9c0f819e675cfbe
  • Pointer size: 132 Bytes
  • Size of remote file: 1.48 MB
assets/t2i_comparison.png ADDED

Git LFS Details

  • SHA256: 23470ac01392176140c71274f9353dfb4a06c311c92c2c3d5b2c5b9064117e09
  • Pointer size: 132 Bytes
  • Size of remote file: 6.63 MB
environment.yml ADDED
@@ -0,0 +1,238 @@
1
+ name: unirl
2
+ channels:
3
+ - defaults
4
+ dependencies:
5
+ - _libgcc_mutex=0.1=main
6
+ - _openmp_mutex=5.1=1_gnu
7
+ - bzip2=1.0.8=h5eee18b_6
8
+ - ca-certificates=2025.2.25=h06a4308_0
9
+ - expat=2.7.1=h6a678d5_0
10
+ - ld_impl_linux-64=2.40=h12ee557_0
11
+ - libffi=3.4.4=h6a678d5_1
12
+ - libgcc-ng=11.2.0=h1234567_1
13
+ - libgomp=11.2.0=h1234567_1
14
+ - libstdcxx-ng=11.2.0=h1234567_1
15
+ - libuuid=1.41.5=h5eee18b_0
16
+ - libxcb=1.17.0=h9b100fa_0
17
+ - ncurses=6.4=h6a678d5_0
18
+ - openssl=3.0.16=h5eee18b_0
19
+ - pip=25.1=pyhc872135_2
20
+ - pthread-stubs=0.3=h0ce48e5_1
21
+ - python=3.11.13=h1a3bd86_0
22
+ - readline=8.2=h5eee18b_0
23
+ - setuptools=78.1.1=py311h06a4308_0
24
+ - sqlite=3.45.3=h5eee18b_0
25
+ - tk=8.6.14=h993c535_1
26
+ - wheel=0.45.1=py311h06a4308_0
27
+ - xorg-libx11=1.8.12=h9b100fa_1
28
+ - xorg-libxau=1.0.12=h9b100fa_0
29
+ - xorg-libxdmcp=1.1.5=h9b100fa_0
30
+ - xorg-xorgproto=2024.1=h5eee18b_1
31
+ - xz=5.6.4=h5eee18b_1
32
+ - zlib=1.2.13=h5eee18b_1
33
+ - pip:
34
+ - accelerate==1.7.0
35
+ - aiohappyeyeballs==2.6.1
36
+ - aiohttp==3.12.9
37
+ - aiosignal==1.3.2
38
+ - airportsdata==20250523
39
+ - annotated-types==0.7.0
40
+ - anthropic==0.54.0
41
+ - antlr4-python3-runtime==4.13.2
42
+ - anyio==4.9.0
43
+ - astor==0.8.1
44
+ - asttokens==3.0.0
45
+ - attrs==25.3.0
46
+ - av==14.4.0
47
+ - bitsandbytes==0.46.0
48
+ - blake3==1.0.5
49
+ - cachetools==6.0.0
50
+ - certifi==2025.4.26
51
+ - charset-normalizer==3.4.2
52
+ - click==8.2.1
53
+ # - clip==1.0
54
+ - cloudpickle==3.1.1
55
+ - compressed-tensors==0.9.4
56
+ - contourpy==1.3.2
57
+ - cupy-cuda12x==13.4.1
58
+ - cycler==0.12.1
59
+ - datasets==3.6.0
60
+ - decorator==5.2.1
61
+ - deepspeed==0.15.4
62
+ - depyf==0.18.0
63
+ # - diffusers==0.34.0.dev0
64
+ - dill==0.3.8
65
+ - diskcache==5.6.3
66
+ - distro==1.9.0
67
+ - dnspython==2.7.0
68
+ - docker-pycreds==0.4.0
69
+ - einops==0.8.1
70
+ - email-validator==2.2.0
71
+ - executing==2.2.0
72
+ - fastapi==0.115.12
73
+ - fastapi-cli==0.0.7
74
+ - fastrlock==0.8.3
75
+ - filelock==3.18.0
76
+ # - flash-attn==2.7.4.post1
77
+ - fonttools==4.58.4
78
+ - frozenlist==1.6.2
79
+ - fsspec==2025.3.0
80
+ - ftfy==6.3.1
81
+ - gguf==0.17.0
82
+ - gitdb==4.0.12
83
+ - gitpython==3.1.44
84
+ - googleapis-common-protos==1.70.0
85
+ - grpcio==1.72.1
86
+ - h11==0.16.0
87
+ - hf-transfer==0.1.9
88
+ - hf-xet==1.1.3
89
+ - hjson==3.1.0
90
+ - httpcore==1.0.9
91
+ - httptools==0.6.4
92
+ - httpx==0.28.1
93
+ - huggingface-hub==0.32.4
94
+ - idna==3.10
95
+ - importlib-metadata==8.7.0
96
+ - inquirerpy==0.3.4
97
+ - interegular==0.3.3
98
+ - ipython==9.3.0
99
+ - ipython-pygments-lexers==1.1.1
100
+ - jedi==0.19.2
101
+ - jinja2==3.1.6
102
+ - jiter==0.10.0
103
+ - jsonschema==4.24.0
104
+ - jsonschema-specifications==2025.4.1
105
+ - kiwisolver==1.4.8
106
+ - lark==1.2.2
107
+ - latex2sympy2-extended==1.10.1
108
+ - liger-kernel==0.5.2
109
+ - llguidance==0.7.29
110
+ - llvmlite==0.44.0
111
+ - lm-format-enforcer==0.10.11
112
+ - markdown-it-py==3.0.0
113
+ - markupsafe==3.0.2
114
+ - math-verify==0.7.0
115
+ - matplotlib==3.10.3
116
+ - matplotlib-inline==0.1.7
117
+ - mdurl==0.1.2
118
+ - mistral-common==1.5.6
119
+ - mpmath==1.3.0
120
+ - msgpack==1.1.0
121
+ - msgspec==0.19.0
122
+ - multidict==6.4.4
123
+ - multiprocess==0.70.16
124
+ - nest-asyncio==1.6.0
125
+ - networkx==3.5
126
+ - ninja==1.11.1.4
127
+ - numba==0.61.2
128
+ - numpy==2.2.6
129
+ - nvidia-cublas-cu12==12.6.4.1
130
+ - nvidia-cuda-cupti-cu12==12.6.80
131
+ - nvidia-cuda-nvrtc-cu12==12.6.77
132
+ - nvidia-cuda-runtime-cu12==12.6.77
133
+ - nvidia-cudnn-cu12==9.5.1.17
134
+ - nvidia-cufft-cu12==11.3.0.4
135
+ - nvidia-cufile-cu12==1.11.1.6
136
+ - nvidia-curand-cu12==10.3.7.77
137
+ - nvidia-cusolver-cu12==11.7.1.2
138
+ - nvidia-cusparse-cu12==12.5.4.2
139
+ - nvidia-cusparselt-cu12==0.6.3
140
+ - nvidia-nccl-cu12==2.26.2
141
+ - nvidia-nvjitlink-cu12==12.6.85
142
+ - nvidia-nvtx-cu12==12.6.77
143
+ - openai==1.84.0
144
+ - opencv-python-headless==4.11.0.86
145
+ - opentelemetry-api==1.34.0
146
+ - opentelemetry-exporter-otlp==1.34.0
147
+ - opentelemetry-exporter-otlp-proto-common==1.34.0
148
+ - opentelemetry-exporter-otlp-proto-grpc==1.34.0
149
+ - opentelemetry-exporter-otlp-proto-http==1.34.0
150
+ - opentelemetry-proto==1.34.0
151
+ - opentelemetry-sdk==1.34.0
152
+ - opentelemetry-semantic-conventions==0.55b0
153
+ - opentelemetry-semantic-conventions-ai==0.4.9
154
+ - outlines==0.1.11
155
+ - outlines-core==0.1.26
156
+ - packaging==25.0
157
+ - pandas==2.3.0
158
+ - parso==0.8.4
159
+ - partial-json-parser==0.2.1.1.post5
160
+ - peft==0.17.1
161
+ - pexpect==4.9.0
162
+ - pfzy==0.3.4
163
+ - pillow==11.2.1
164
+ - platformdirs==4.3.8
165
+ - prometheus-client==0.22.1
166
+ - prometheus-fastapi-instrumentator==7.1.0
167
+ - prompt-toolkit==3.0.51
168
+ - propcache==0.3.1
169
+ - protobuf==5.29.5
170
+ - psutil==7.0.0
171
+ - ptyprocess==0.7.0
172
+ - pure-eval==0.2.3
173
+ - py-cpuinfo==9.0.0
174
+ - pyarrow==20.0.0
175
+ - pycountry==24.6.1
176
+ - pydantic==2.11.5
177
+ - pydantic-core==2.33.2
178
+ - pygments==2.19.1
179
+ - pyparsing==3.2.3
180
+ - python-dateutil==2.9.0.post0
181
+ - python-dotenv==1.1.0
182
+ - python-json-logger==3.3.0
183
+ - python-multipart==0.0.20
184
+ - pytz==2025.2
185
+ - pyyaml==6.0.2
186
+ - pyzmq==26.4.0
187
+ - qwen-vl-utils==0.0.11
188
+ - ray==2.46.0
189
+ - referencing==0.36.2
190
+ - regex==2024.11.6
191
+ - requests==2.32.3
192
+ - rich==14.0.0
193
+ - rich-toolkit==0.14.7
194
+ - rpds-py==0.25.1
195
+ - safetensors==0.5.3
196
+ - scipy==1.15.3
197
+ - seaborn==0.13.2
198
+ - sentencepiece==0.2.0
199
+ - sentry-sdk==2.29.1
200
+ - setproctitle==1.3.6
201
+ - shellingham==1.5.4
202
+ - six==1.17.0
203
+ - smmap==5.0.2
204
+ - sniffio==1.3.1
205
+ - stack-data==0.6.3
206
+ - starlette==0.46.2
207
+ - sympy==1.14.0
208
+ - tabulate==0.9.0
209
+ - tiktoken==0.9.0
210
+ - timm==0.6.13
211
+ - tokenizers==0.21.1
212
+ - torch==2.7.0
213
+ - torchaudio==2.7.0
214
+ - torchvision==0.22.0
215
+ - tqdm==4.67.1
216
+ - traitlets==5.14.3
217
+ - transformers==4.51.3
218
+ - triton==3.3.0
219
+ - trl==0.19.0
220
+ - typer==0.16.0
221
+ - typing-extensions==4.14.0
222
+ - typing-inspection==0.4.1
223
+ - tzdata==2025.2
224
+ - urllib3==2.4.0
225
+ - utils==1.0.2
226
+ - uvicorn==0.34.3
227
+ - uvloop==0.21.0
228
+ - vllm==0.9.0.1
229
+ - wandb==0.18.3
230
+ - watchfiles==1.0.5
231
+ - wcwidth==0.2.13
232
+ - websockets==15.0.1
233
+ - xformers==0.0.30
234
+ - xgrammar==0.1.19
235
+ - xxhash==3.5.0
236
+ - yarl==1.20.0
237
+ - zipp==3.22.0
238
+ - tensorboardX==2.6.4
eval.py ADDED
@@ -0,0 +1,367 @@
1
+ #!/usr/bin/env python3
2
+ """Batch image evaluation tool with YAML configuration."""
3
+
4
+ import requests
5
+ import pickle
6
+ from PIL import Image
7
+ from typing import List, Dict, Any, Union, Optional, Tuple
8
+ import sys
9
+ import os
10
+ import json
11
+ import yaml
12
+ from io import BytesIO
13
+ from tqdm import tqdm
14
+ from datetime import datetime
15
+
16
+
17
+ PAIR_SCORERS = {"editreward"}
18
+ CAPTION_SUFFIXES = ["_caption.txt", "_prompt.txt"]
19
+
20
+
21
+ class RewardEvaluatorClient:
22
+ def __init__(self, scorer_urls: Dict[str, str]):
23
+ self.scorer_urls = scorer_urls
24
+
25
+ def evaluate(self,
26
+ model_name: str,
27
+ images: Union[List[Image.Image], Dict[str, List[Image.Image]]],
28
+ prompts: List[str],
29
+ metadata: Dict[str, Any] = None) -> Union[List[float], Dict[str, Any]]:
30
+ url = self.scorer_urls.get(model_name)
31
+ if not url:
32
+ raise ValueError(f"Reward model '{model_name}' URL not configured.")
33
+
34
+ payload_bytes = create_payload(images, prompts, metadata)
35
+
36
+ try:
37
+ response = requests.post(url, data=payload_bytes, timeout=600)
38
+ response.raise_for_status()
39
+ result = parse_response(response.content)
40
+
41
+ if isinstance(result, dict) and "error" in result:
42
+ raise RuntimeError(f"Scorer '{model_name}' returned error: {result['error']}")
43
+
44
+ return result
45
+
46
+ except requests.exceptions.RequestException as e:
47
+ raise RuntimeError(f"HTTP request to '{model_name}' failed: {e}")
48
+ except Exception as e:
49
+ raise RuntimeError(f"Failed to process response from '{model_name}': {e}")
50
+
51
+
52
+ def serialize_images(images: List[Image.Image]) -> List[bytes]:
53
+ images_bytes = []
54
+ for img in images:
55
+ img_byte_arr = BytesIO()
56
+ if img.mode != 'RGB':
57
+ img = img.convert('RGB')
58
+ img.save(img_byte_arr, format="JPEG")
59
+ images_bytes.append(img_byte_arr.getvalue())
60
+ return images_bytes
61
+
62
+
63
+ def create_payload(images: Union[List[Image.Image], Dict[str, List[Image.Image]]],
64
+ prompts: List[str],
65
+ metadata: Dict[str, Any] = None) -> bytes:
66
+ if isinstance(images, dict):
67
+ serialized_images = {key: serialize_images(value) for key, value in images.items()}
68
+ else:
69
+ serialized_images = serialize_images(images)
70
+
71
+ return pickle.dumps({
72
+ "images": serialized_images,
73
+ "prompts": prompts,
74
+ "metadata": metadata or {}
75
+ })
76
+
77
+
78
+ def parse_response(response_content: bytes) -> Union[List[float], Dict[str, Any]]:
79
+ return pickle.loads(response_content)
80
+
81
+
82
+ def find_caption_file(base_path: str, base_name: str) -> Optional[str]:
83
+ for suffix in CAPTION_SUFFIXES:
84
+ caption_path = os.path.join(base_path, f"{base_name}{suffix}")
85
+ if os.path.exists(caption_path):
86
+ return caption_path
87
+ return None
88
+
89
+
90
+ def collect_standard_samples(folder_path: str) -> Tuple[List[Image.Image], List[str], List[str]]:
91
+ images, prompts, filenames = [], [], []
92
+
93
+ for file in sorted(os.listdir(folder_path)):
94
+ if not file.lower().endswith(('.png', '.jpg', '.jpeg')):
95
+ continue
96
+ if any(suffix in file for suffix in ['_edited', '_reference', '_source']):
97
+ continue
98
+
99
+ base_name = os.path.splitext(file)[0]
100
+ img_path = os.path.join(folder_path, file)
101
+ caption_path = find_caption_file(folder_path, base_name)
102
+
103
+ if not caption_path:
104
+ continue
105
+
106
+ try:
107
+ img = Image.open(img_path)
108
+ with open(caption_path, 'r', encoding='utf-8') as f:
109
+ prompt = f.read().strip()
110
+ images.append(img)
111
+ prompts.append(prompt)
112
+ filenames.append(file)
113
+ except Exception as e:
114
+ print(f" Warning: Failed to process {file}: {e}")
115
+
116
+ return images, prompts, filenames
117
+
118
+
119
+ def collect_edit_samples(folder_path: str) -> Tuple[Dict[str, List[Image.Image]], List[str], List[str]]:
120
+ source_images, edited_images, prompts, filenames = [], [], [], []
121
+
122
+ edited_files = [f for f in os.listdir(folder_path) if f.endswith('_edited.png')]
123
+
124
+ for edited_file in sorted(edited_files):
125
+ base_name = edited_file.replace('_edited.png', '')
126
+ source_file = f"{base_name}_reference.png"
127
+
128
+ if not os.path.exists(os.path.join(folder_path, source_file)):
129
+ source_file = f"{base_name}_source.png"
130
+
131
+ source_path = os.path.join(folder_path, source_file)
132
+ edited_path = os.path.join(folder_path, edited_file)
133
+ caption_path = find_caption_file(folder_path, base_name)
134
+
135
+ if not os.path.exists(source_path) or not caption_path:
136
+ continue
137
+
138
+ try:
139
+ source_img = Image.open(source_path)
140
+ edited_img = Image.open(edited_path)
141
+ with open(caption_path, 'r', encoding='utf-8') as f:
142
+ prompt = f.read().strip()
143
+
144
+ source_images.append(source_img)
145
+ edited_images.append(edited_img)
146
+ prompts.append(prompt)
147
+ filenames.append(base_name)
148
+ except Exception as e:
149
+ print(f" Warning: Failed to process {base_name}: {e}")
150
+
151
+ return {'source': source_images, 'edited': edited_images}, prompts, filenames
152
+
153
+
154
+ def evaluate_folder(folder_path: str,
155
+ model_name: str,
156
+ batch_size: int,
157
+ scorer_urls: Dict[str, str],
158
+ verbose: bool = True) -> Optional[Dict[str, Any]]:
159
+ if not os.path.isdir(folder_path):
160
+ return None
161
+
162
+ evaluator = RewardEvaluatorClient(scorer_urls)
163
+ is_pair_scorer = model_name in PAIR_SCORERS
164
+
165
+ if is_pair_scorer:
166
+ images, prompts, filenames = collect_edit_samples(folder_path)
167
+ sample_count = len(prompts)
168
+ else:
169
+ images, prompts, filenames = collect_standard_samples(folder_path)
170
+ sample_count = len(images)
171
+
172
+ if sample_count == 0:
173
+ if verbose:
174
+ print(f" Skipped (no valid samples): {folder_path}")
175
+ return None
176
+
177
+ if verbose:
178
+ print(f" Evaluating {sample_count} samples: {folder_path}")
179
+
180
+ all_scores = []
181
+
182
+ if is_pair_scorer:
183
+ source_images = images['source']
184
+ edited_images = images['edited']
185
+
186
+ for start_idx in tqdm(range(0, sample_count, batch_size), disable=not verbose):
187
+ end_idx = min(start_idx + batch_size, sample_count)
188
+ batch_images = {
189
+ 'source': source_images[start_idx:end_idx],
190
+ 'edited': edited_images[start_idx:end_idx]
191
+ }
192
+ batch_prompts = prompts[start_idx:end_idx]
193
+
194
+ try:
195
+ batch_results = evaluator.evaluate(model_name, batch_images, batch_prompts)
196
+ scores = batch_results.get('scores', batch_results) if isinstance(batch_results, dict) else batch_results
197
+ all_scores.extend(scores)
198
+ except Exception as e:
199
+ print(f" Batch evaluation failed [{start_idx}:{end_idx}]: {e}")
200
+ return None
201
+ else:
202
+ for start_idx in tqdm(range(0, sample_count, batch_size), disable=not verbose):
203
+ end_idx = min(start_idx + batch_size, sample_count)
204
+ batch_images = images[start_idx:end_idx]
205
+ batch_prompts = prompts[start_idx:end_idx]
206
+
207
+ try:
208
+ batch_results = evaluator.evaluate(model_name, batch_images, batch_prompts)
209
+ scores = batch_results.get('scores', batch_results) if isinstance(batch_results, dict) else batch_results
210
+ all_scores.extend(scores)
211
+ except Exception as e:
212
+ print(f" Batch evaluation failed [{start_idx}:{end_idx}]: {e}")
213
+ continue
214
+
215
+ if not all_scores:
216
+ return None
217
+
218
+ return {
219
+ 'folder': folder_path,
220
+ 'model': model_name,
221
+ 'average': sum(all_scores) / len(all_scores),
222
+ 'scores': all_scores,
223
+ 'count': len(all_scores)
224
+ }
225
+
226
+
227
+ def find_leaf_folders(root_path: str, min_depth: int = 0, max_depth: int = -1) -> List[str]:
228
+ result = []
229
+ root_path = os.path.abspath(root_path)
230
+
231
+ def has_images(folder: str) -> bool:
232
+ for f in os.listdir(folder):
233
+ if f.lower().endswith(('.png', '.jpg', '.jpeg')):
234
+ return True
235
+ return False
236
+
237
+ def recurse(current_path: str, depth: int):
238
+ if max_depth >= 0 and depth > max_depth:
239
+ return
240
+
241
+ try:
242
+ entries = os.listdir(current_path)
243
+ except PermissionError:
244
+ return
245
+
246
+ subdirs = [e for e in entries if os.path.isdir(os.path.join(current_path, e))]
247
+
248
+ if not subdirs or (max_depth >= 0 and depth == max_depth):
249
+ if depth >= min_depth and has_images(current_path):
250
+ result.append(current_path)
251
+ else:
252
+ for subdir in subdirs:
253
+ recurse(os.path.join(current_path, subdir), depth + 1)
254
+ if depth >= min_depth and has_images(current_path):
255
+ result.append(current_path)
256
+
257
+ recurse(root_path, 0)
258
+ return sorted(result)
259
+
260
+
261
+ def run(config: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
262
+ scorer_urls = config['scorer_urls']
263
+ defaults = config.get('defaults', {})
264
+ evaluations = config['evaluations']
265
+ output_file = config.get('output')
266
+ verbose = config.get('verbose', True)
267
+
268
+ default_batch_size = defaults.get('batch_size', 64)
269
+ default_recursive = defaults.get('recursive', False)
270
+ default_min_depth = defaults.get('min_depth', 0)
271
+ default_max_depth = defaults.get('max_depth', -1)
272
+
273
+ all_results = {}
274
+
275
+ for eval_item in evaluations:
276
+ path = eval_item.get('path')
277
+ if not path:
278
+ print("Warning: Evaluation item missing 'path', skipping")
279
+ continue
280
+
281
+ models = eval_item.get('models', [])
282
+ if not models:
283
+ print(f"Warning: No models specified for {path}, skipping")
284
+ continue
285
+
286
+ batch_size = eval_item.get('batch_size', default_batch_size)
287
+ recursive = eval_item.get('recursive', default_recursive)
288
+ min_depth = eval_item.get('min_depth', default_min_depth)
289
+ max_depth = eval_item.get('max_depth', default_max_depth)
290
+
291
+ if not recursive:
292
+ max_depth = 0
293
+
294
+ folders = find_leaf_folders(path, min_depth, max_depth)
295
+
296
+ if not folders:
297
+ print(f"No image folders found in: {path}")
298
+ continue
299
+
300
+ print(f"\nProcessing {len(folders)} folder(s) from: {path}")
301
+ print(f"Models: {', '.join(models)}")
302
+ print("-" * 60)
303
+
304
+ for folder in tqdm(folders, desc="Folders", disable=not verbose):
305
+ folder_results = {}
306
+
307
+ for model in models:
308
+ if verbose:
309
+ print(f"\n[{model}] ", end="")
310
+
311
+ result = evaluate_folder(folder, model, batch_size, scorer_urls, verbose)
312
+
313
+ if result:
314
+ folder_results[model] = result
315
+ if verbose:
316
+ print(f" -> Average: {result['average']:.4f} (n={result['count']})")
317
+
318
+ if folder_results:
319
+ rel_path = os.path.relpath(folder, path)
320
+ key = f"{path}:{rel_path}" if rel_path != "." else path
321
+ all_results[key] = folder_results
322
+
323
+ # Print summary
324
+ print("\n" + "=" * 60)
325
+ print("Evaluation Summary")
326
+ print("=" * 60)
327
+ for folder, results in all_results.items():
328
+ print(f"\n{folder}")
329
+ for model, data in results.items():
330
+ print(f" [{model}] avg={data['average']:.4f}, n={data['count']}")
331
+
332
+ # Save results
333
+ if output_file:
334
+ serializable = {
335
+ folder: {
336
+ model: {'average': data['average'], 'count': data['count']}
337
+ for model, data in results.items()
338
+ }
339
+ for folder, results in all_results.items()
340
+ }
341
+
342
+ with open(output_file, 'w', encoding='utf-8') as f:
343
+ json.dump({
344
+ 'timestamp': datetime.now().isoformat(),
345
+ 'results': serializable
346
+ }, f, indent=2, ensure_ascii=False)
347
+
348
+ print(f"\nResults saved to: {output_file}")
349
+
350
+ return all_results
351
+
352
+
353
+ def main():
354
+ if len(sys.argv) != 2:
355
+ print(f"Usage: python {sys.argv[0]} <config.yaml>")
356
+ sys.exit(1)
357
+
358
+ config_path = sys.argv[1]
359
+ with open(config_path, 'r', encoding='utf-8') as f:
360
+ config = yaml.safe_load(f)
361
+
362
+ results = run(config)
363
+ sys.exit(0 if results else 1)
364
+
365
+
366
+ if __name__ == "__main__":
367
+ main()
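`eval.py` talks to each scorer URL over a simple pickle-over-HTTP protocol: `create_payload` POSTs a pickled dict with JPEG-encoded `images` (a flat list, or `source`/`edited` lists for pair scorers such as `editreward`), `prompts`, and `metadata`, and expects back a pickled list of per-sample scores or a dict with a `scores` key. The scorer servers themselves are not part of this commit; the sketch below is a minimal compatible endpoint with a placeholder constant reward and an arbitrary port.

```python
# Minimal scorer endpoint compatible with eval.py's RewardEvaluatorClient
# (illustrative sketch; the real scorer servers are not included in this commit).
import pickle
from http.server import BaseHTTPRequestHandler, HTTPServer
from io import BytesIO
from PIL import Image

class ScorerHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        length = int(self.headers.get("Content-Length", 0))
        payload = pickle.loads(self.rfile.read(length))  # {"images", "prompts", "metadata"}
        images = [Image.open(BytesIO(b)) for b in payload["images"]]  # flat-list form
        prompts = payload["prompts"]
        scores = [0.0 for _ in zip(images, prompts)]  # placeholder: plug a real reward model in here
        body = pickle.dumps({"scores": scores})
        self.send_response(200)
        self.send_header("Content-Type", "application/octet-stream")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

if __name__ == "__main__":
    HTTPServer(("0.0.0.0", 18080), ScorerHandler).serve_forever()  # port matches the sample config
```

A pair scorer would instead read `payload["images"]["source"]` and `payload["images"]["edited"]`.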
gen.sh ADDED
@@ -0,0 +1,48 @@
+ #!/bin/bash
+ set -e
+
+ # Download eval datasets if not present
+ EDIT_DATA="data/omni_edit_dev.parquet"
+ if [ ! -f "$EDIT_DATA" ]; then
+     echo "Downloading edit eval dataset..."
+     mkdir -p data
+     huggingface-cli download wangfuyun/PrompRL data/omni_edit_dev.parquet \
+         --repo-type model --local-dir . --local-dir-use-symlinks False
+ fi
+
+ # Text-to-Image OCR
+ python unified_inference.py --mode t2i \
+     --model_path wangfuyun/PrompRL/promptrl_ocr \
+     --model_type flux \
+     --prompt_file prompts/ocr_test.txt \
+     --output_dir outputs/ocr \
+     --use_cot --cot_template ocr_clarity_v2
+
+ # Text-to-Image PS
+ python unified_inference.py --mode t2i \
+     --model_path wangfuyun/PrompRL/promptrl_ps \
+     --model_type flux \
+     --prompt_file prompts/draw_test.txt \
+     --output_dir outputs/pickscore \
+     --use_cot --cot_template quality_purev2
+
+ # GenEval
+ python unified_inference.py --mode geneval \
+     --model_path wangfuyun/PrompRL/promptrl_geneval \
+     --model_type flux \
+     --metadata_file prompts/evaluation_metadata.jsonl \
+     --output_dir outputs/geneval \
+     --use_cot --cot_template geneval \
+     --n_samples 4
+
+ # Image Editing
+ python unified_inference.py --mode edit \
+     --model_path wangfuyun/PrompRL/promptrl_edit \
+     --model_type kontext \
+     --data_file "$EDIT_DATA" \
+     --output_dir outputs/edit \
+     --use_cot --cot_template edit_general \
+     --guidance_scale 2.5
+
+
+ # python eval.py prompts/config.yaml
prompts/config.yaml ADDED
@@ -0,0 +1,35 @@
+ # Batch Image Evaluator Configuration
+
+ scorer_urls:
+   aesthetic: "http://YOUR_SERVER_IP:18080/"
+   image_reward: "http://YOUR_SERVER_IP:18081/"
+   ocr: "http://YOUR_SERVER_IP:18082/"
+   pickscore: "http://YOUR_SERVER_IP:18083/"
+   deqa: "http://YOUR_SERVER_IP:18084/"
+   gen_eval: "http://YOUR_SERVER_IP:18085/"
+   unifiedreward_sglang: "http://YOUR_SERVER_IP:18086/"
+   hps: "http://YOUR_SERVER_IP:18087/"
+   editreward: "http://YOUR_SERVER_IP:18088/"
+
+ defaults:
+   batch_size: 64
+   recursive: false
+   min_depth: 0
+   max_depth: -1
+
+ output: results.json
+ verbose: true
+
+ evaluations:
+   - path: ./outputs/ocr
+     models: [ocr]
+     batch_size: 32
+     recursive: true
+
+   - path: ./outputs/edit
+     models: [editreward]
+     batch_size: 32
+
+   - path: ./outputs/pickscore
+     models: [pickscore]
+     batch_size: 32
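With the scorer endpoints filled in, evaluation can be driven from the CLI (`python eval.py prompts/config.yaml`, as noted at the end of `gen.sh`) or programmatically; a minimal sketch, assuming it runs from the repository root:

```python
# Minimal sketch: drive eval.py programmatically with the config above
# (equivalent to `python eval.py prompts/config.yaml`).
import yaml
from eval import run

with open("prompts/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

results = run(config)  # {folder: {model: {"average", "scores", "count", ...}}}
for folder, per_model in results.items():
    for model, data in per_model.items():
        print(f"{folder} [{model}] avg={data['average']:.4f} (n={data['count']})")
```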
prompts/draw_test.txt ADDED
@@ -0,0 +1,1000 @@
1
+ New York Skyline with 'Google Research Pizza Cafe' written with fireworks on the sky.
2
+ A maglev train going vertically downward in high speed, New York Times photojournalism.
3
+ A pyramid made of falafel with a partial solar eclipse in the background.
4
+ A storefront with 'Google Brain Toronto' written on it.
5
+ An elephant under the sea.
6
+ Lego Arnold Schwarzenegger.
7
+ A keyboard made of water, the water is made of light, the light is turned off.
8
+ Artophagous.
9
+ One cat and one dog sitting on the grass.
10
+ A laptop on top of a teddy bear.
11
+ A red colored car.
12
+ A stack of 3 books. A green book is on the top, sitting on a red book. The red book is in the middle, sitting on a blue book. The blue book is on the bottom.
13
+ A green colored banana.
14
+ Matutinal.
15
+ A green cup and a blue cell phone.
16
+ A stack of 3 plates. A blue plate is on the top, sitting on a blue plate. The blue plate is in the middle, sitting on a green plate. The green plate is on the bottom.
17
+ A large thick-skinned semiaquatic African mammal, with massive jaws and large tusks.
18
+ A red colored banana.
19
+ Jentacular.
20
+ A sign that says 'Hello World'.
21
+ A blue cup and a green cell phone.
22
+ A black colored banana.
23
+ Two cats and two dogs sitting on the grass.
24
+ A ldarge keybord msical instroument lwith a woden case enmclosig a qsouvnkboajrd and mfgtal strivgf, which are strucrk b hammrs when the nels are depresdsmed.f lhe strsingsj' vibration ie stopped by damperds when the keys re released and can bce regulavewdd for lengh and vnolume y two or three pedalvs.
25
+ A magnifying glass over a page of a 1950s batman comic.
26
+ A separate seat for one person, typically with a back and four legs.
27
+ Two dogs on the street.
28
+ New York Skyline with 'Diffusion' written with fireworks on the sky.
29
+ A black colored banana.
30
+ An ancient Egyptian painting depicting an argument over whose turn it is to take out the trash.
31
+ A wine glass on top of a dog.
32
+ An emoji of a baby panda wearing a red hat, green gloves, red shirt, and green pants.
33
+ A pear cut into seven pieces arranged in a ring.
34
+ A large thick-skinned semiaquatic African mammal, with massive jaws and large tusks.
35
+ A baby fennec sneezing onto a strawberry, detailed, macro, studio light, droplets, backlit ears.
36
+ A panda making latte art.
37
+ An IT-guy trying to fix hardware of a PC tower is being tangled by the PC cables like Laokoon. Marble, copy after Hellenistic original from ca. 200 BC. Found in the Baths of Trajan, 1506.
38
+ A blue bird and a brown bear.
39
+ A triangular purple flower pot. A purple flower pot in the shape of a triangle.
40
+ A green apple and a black backpack.
41
+ A grocery store refrigerator has pint cartons of milk on the top shelf, quart cartons on the middle shelf, and gallon plastic jugs on the bottom shelf.
42
+ A small domesticated carnivorous mammal with soft fur, a short snout, and retractable claws. It is widely kept as a pet or for catching mice, and many breeds have been developed.
43
+ An orange colored sandwich.
44
+ A large motor vehicle carrying passengers by road, typically one serving the public on a fixed route and for a fare.
45
+ A sphere made of kitchen tile. A sphere with the texture of kitchen tile.
46
+ A cat on the right of a tennis racket.
47
+ Bzaseball galove.
48
+ A sign that says 'NeurIPS'.
49
+ A 1960s yearbook photo with animals dressed as humans.
50
+ New York Skyline with 'Hello World' written with fireworks on the sky.
51
+ Hovering cow abducting aliens.
52
+ A small vessel propelled on water by oars, sails, or an engine.
53
+ A type of digital currency in which a record of transactions is maintained and new units of currency are generated by the computational solution of mathematical problems, and which operates independently of a central bank.
54
+ A pink colored car.
55
+ A storefront with 'NeurIPS' written on it.
56
+ A black apple and a green backpack.
57
+ A large motor vehicle carrying passengers by road, typically one serving the public on a fixed route and for a fare.
58
+ A long curved fruit which grows in clusters and has soft pulpy flesh and yellow skin when ripe.
59
+ A black colored car.
60
+ A realistic photo of a Pomeranian dressed up like a 1980s professional wrestler with neon green and neon orange face paint and bright green wrestling tights with bright orange boots.
61
+ Tcennis rpacket.
62
+ McDonalds Church.
63
+ Painting of Mona Lisa but the view is from behind of Mona Lisa.
64
+ An elephant is behind a tree. You can see the trunk on one side and the back legs on the other.
65
+ Hovering cow abducting aliens.
66
+ Photo of a mega Lego space station inside a kid's bedroom.
67
+ An elephant under the sea.
68
+ One cat and two dogs sitting on the grass.
69
+ A green colored banana.
70
+ An American multinational technology company that focuses on artificial intelligence, search engine, online advertising, cloud computing, computer software, quantum computing, e-commerce, and consumer electronics.
71
+ A domesticated carnivvorous mzammal that typicbally hfaas a lons sfnout, an acxujte sense off osmell, noneetractaaln crlaws, anid xbarkring,y howlingu, or whining rvoiche.
72
+ Jentacular.
73
+ A wine glass on top of a dog.
74
+ A carrot on the left of a broccoli.
75
+ Pafrking metr.
76
+ Three cars on the street.
77
+ In late afternoon in January in New England, a man stands in the shadow of a maple tree.
78
+ An oil painting portrait of the regal Burger King posing with a Whopper.
79
+ A sign that says 'Text to Image'.
80
+ A small vessel propelled on water by oars, sails, or an engine.
81
+ A single clock is sitting on a table.
82
+ A stack of 3 plates. A blue plate is on the top, sitting on a blue plate. The blue plate is in the middle, sitting on a green plate. The green plate is on the bottom.
83
+ An elephant under the sea.
84
+ A type of digital currency in which a record of transactions is maintained and new units of currency are generated by the computational solution of mathematical problems, and which operates independently of a central bank.
85
+ A yellow colored giraffe.
86
+ An elephant is behind a tree. You can see the trunk on one side and the back legs on the other.
87
+ A device consisting of a circular canopy of cloth on a folding metal frame supported by a central rod, used as protection against rain or sometimes sun.
88
+ A fluffy baby sloth with a knitted hat trying to figure out a laptop, close up, highly detailed, studio lighting, screen reflecting in its eyes.
89
+ A stack of 3 plates. A blue plate is on the top, sitting on a blue plate. The blue plate is in the middle, sitting on a green plate. The green plate is on the bottom.
90
+ Three cats and three dogs sitting on the grass.
91
+ A large keyboard musical instrument with a wooden case enclosing a soundboard and metal strings, which are struck by hammers when the keys are depressed. The strings' vibration is stopped by dampers when the keys are released and can be regulated for length and volume by two or three pedals.
92
+ A blue coloured pizza.
93
+ A storefront with 'Google Research Pizza Cafe' written on it.
94
+ A sjmall domesticated carnivorious mammnal with sof fuh,y a sthort sout, and retracwtablbe flaws. It iw widexly kept as a pet or for catchitng mic, ad many breeds zhlyde beefn develvoked.
95
+ A green apple and a black backpack.
96
+ A pink colored car.
97
+ A pear cut into seven pieces arranged in a ring.
98
+ A screenshot of an iOS app for ordering different types of milk.
99
+ Rbefraigerator.
100
+ A blue colored dog.
101
+ Two cats and two dogs sitting on the grass.
102
+ A real life photography of super mario, 8k Ultra HD.
103
+ New York Skyline with 'Hello World' written with fireworks on the sky.
104
+ A realistic photo of a Pomeranian dressed up like a 1980s professional wrestler with neon green and neon orange face paint and bright green wrestling tights with bright orange boots.
105
+ A panda making latte art.
106
+ A storefront with 'NeurIPS' written on it.
107
+ A large keyboard musical instrument with a wooden case enclosing a soundboard and metal strings, which are struck by hammers when the keys are depressed. The strings' vibration is stopped by dampers when the keys are released and can be regulated for length and volume by two or three pedals.
108
+ A blue colored dog.
109
+ Three cats and two dogs sitting on the grass.
110
+ New York Skyline with 'Google Brain Toronto' written with fireworks on the sky.
111
+ A blue coloured pizza.
112
+ A panda making latte art.
113
+ An American multinational technology company that focuses on artificial intelligence, search engine, online advertising, cloud computing, computer software, quantum computing, e-commerce, and consumer electronics.
114
+ Backlotter.
115
+ A black colored sandwich.
116
+ A large thick-skinned semiaquatic African mammal, with massive jaws and large tusks.
117
+ A domesticated carnivorous mammal that typically has a long snout, an acute sense of smell, nonretractable claws, and a barking, howling, or whining voice.
118
+ New York Skyline with 'Deep Learning' written with fireworks on the sky.
119
+ A black colored dog.
120
+ A stack of 3 cubes. A red cube is on the top, sitting on a red cube. The red cube is in the middle, sitting on a green cube. The green cube is on the bottom.
121
+ A type of digital currency in which a record of transactions is maintained and new units of currency are generated by the computational solution of mathematical problems, and which operates independently of a central bank.
122
+ Five cars on the street.
123
+ An old photograph of a 1920s airship shaped like a pig, floating over a wheat field.
124
+ Illustration of a mouse using a mushroom as an umbrella.
125
+ Three cats and one dog sitting on the grass.
126
+ Four cars on the street.
127
+ A black colored sandwich.
128
+ Five cars on the street.
129
+ An American multinational technology company that focuses on artificial intelligence, search engine, online advertising, cloud computing, computer software, quantum computing, e-commerce, and consumer electronics.
130
+ A sign that says 'Google Brain Toronto'.
131
+ A storefront with 'Text to Image' written on it.
132
+ A magnifying glass over a page of a 1950s batman comic.
133
+ A sphere made of kitchen tile. A sphere with the texture of kitchen tile.
134
+ An IT-guy trying to fix hardware of a PC tower is being tangled by the PC cables like Laokoon. Marble, copy after Hellenistic original from ca. 200 BC. Found in the Baths of Trajan, 1506.
135
+ A sign that says 'Diffusion'.
136
+ A blue bird and a brown bear.
137
+ A photo of a confused grizzly bear in calculus class.
138
+ A grocery store refrigerator has pint cartons of milk on the top shelf, quart cartons on the middle shelf, and gallon plastic jugs on the bottom shelf.
139
+ A hair drier underneath a sheep.
140
+ Pafrking metr.
141
+ Peristeronic.
142
+ Two cats and one dog sitting on the grass.
143
+ New York Skyline with 'Google Research Pizza Cafe' written with fireworks on the sky.
144
+ A side view of an owl sitting in a field.
145
+ A pink colored car.
146
+ Paying for a quarter-sized pizza with a pizza-sized quarter.
147
+ Dininrg tablez.
148
+ A fish eating a pelican.
149
+ One cat and three dogs sitting on the grass.
150
+ An instrument used for cutting cloth, paper, and other thin material, consisting of two blades laid one on top of the other and fastened in the middle so as to allow them to be opened and closed by a thumb and finger inserted through rings on the end of their handles.
151
+ A side view of an owl sitting in a field.
152
+ A large thick-skinned semiaquatic African mammal, with massive jaws and large tusks.
153
+ A large keyboard musical instrument with a wooden case enclosing a soundboard and metal strings, which are struck by hammers when the keys are depressed. The strings' vibration is stopped by dampers when the keys are released and can be regulated for length and volume by two or three pedals.
154
+ Pafrking metr.
155
+ A sign that says 'Deep Learning'.
156
+ A collection of nail is sitting on a table.
157
+ One car on the street.
158
+ An emoji of a baby panda wearing a red hat, blue gloves, green shirt, and blue pants.
159
+ A brown bird and a blue bear.
160
+ A donkey and an octopus are playing a game. The donkey is holding a rope on one end, the octopus is holding onto the other. The donkey holds the rope in its mouth. A cat is jumping over the rope.
161
+ A fisheye lens view of a turtle sitting in a forest.
162
+ A large motor vehicle carrying passengers by road, typically one serving the public on a fixed route and for a fare.
163
+ New York Skyline with 'Hello World' written with fireworks on the sky.
164
+ An emoji of a baby panda wearing a red hat, green gloves, red shirt, and green pants.
165
+ A black colored dog.
166
+ A ldarge keybord msical instroument lwith a woden case enmclosig a qsouvnkboajrd and mfgtal strivgf, which are strucrk b hammrs when the nels are depresdsmed.f lhe strsingsj' vibration ie stopped by damperds when the keys re released and can bce regulavewdd for lengh and vnolume y two or three pedalvs.
167
+ Artophagous.
168
+ A yellow book and a red vase.
169
+ A stack of 3 books. A green book is on the top, sitting on a red book. The red book is in the middle, sitting on a blue book. The blue book is on the bottom.
170
+ A pizza on the right of a suitcase.
171
+ A tiger in a lab coat with a 1980s Miami vibe, turning a well oiled science content machine, digital art.
172
+ A storefront with 'Hello World' written on it.
173
+ A tiger in a lab coat with a 1980s Miami vibe, turning a well oiled science content machine, digital art.
174
+ A storefront with 'Google Brain Toronto' written on it.
175
+ A 1960s poster warning against climate change.
176
+ An organ of soft nervous tissue contained in the skull of vertebrates, functioning as the coordinating center of sensation and intellectual and nervous activity.
177
+ A long curved fruit which grows in clusters and has soft pulpy flesh and yellow skin when ripe.
178
+ Supreme Court Justices play a baseball game with the FBI. The FBI is at bat, the justices are on the field.
179
+ A pyramid made of falafel with a partial solar eclipse in the background.
180
+ A single clock is sitting on a table.
181
+ New York Skyline with 'Google Research Pizza Cafe' written with fireworks on the sky.
182
+ A blue cup and a green cell phone.
183
+ An oil painting of a couple in formal evening wear going home get caught in a heavy downpour with no umbrellas.
184
+ Darth Vader playing with raccoon in Mars during sunset.
185
+ A red car and a white sheep.
186
+ An illustration of a large red elephant sitting on a small blue mouse.
187
+ An illustration of a small green elephant standing behind a large red mouse.
188
+ A domesticated carnivorous mammal that typically has a long snout, an acute sense of smell, nonretractable claws, and a barking, howling, or whining voice.
189
+ A medieval painting of the wifi not working.
190
+ An American multinational technology company that focuses on artificial intelligence, search engine, online advertising, cloud computing, computer software, quantum computing, e-commerce, and consumer electronics.
191
+ One cat and two dogs sitting on the grass.
192
+ An IT-guy trying to fix hardware of a PC tower is being tangled by the PC cables like Laokoon. Marble, copy after Hellenistic original from ca. 200 BC. Found in the Baths of Trajan, 1506.
193
+ A fluffy baby sloth with a knitted hat trying to figure out a laptop, close up, highly detailed, studio lighting, screen reflecting in its eyes.
194
+ Abraham Lincoln touches his toes while George Washington does chin-ups. Lincoln is barefoot. Washington is wearing boots.
195
+ An umbrella on top of a spoon.
196
+ Matutinal.
197
+ A pink colored giraffe.
198
+ An emoji of a baby panda wearing a red hat, green gloves, red shirt, and green pants.
199
+ Illustration of a mouse using a mushroom as an umbrella.
200
+ A brown bird and a blue bear.
201
+ A painting by Grant Wood of an astronaut couple, american gothic style.
202
+ A sign that says 'Diffusion'.
203
+ Five dogs on the street.
204
+ Four dogs on the street.
205
+ A cat on the left of a dog.
206
+ A zebra underneath a broccoli.
207
+ A banana on the left of an apple.
208
+ Two cats and three dogs sitting on the grass.
209
+ A yellow colored giraffe.
210
+ Three cats and one dog sitting on the grass.
211
+ A ldarge keybord msical instroument lwith a woden case enmclosig a qsouvnkboajrd and mfgtal strivgf, which are strucrk b hammrs when the nels are depresdsmed.f lhe strsingsj' vibration ie stopped by damperds when the keys re released and can bce regulavewdd for lengh and vnolume y two or three pedalvs.
212
+ Abraham Lincoln touches his toes while George Washington does chin-ups. Lincoln is barefoot. Washington is wearing boots.
213
+ A yellow book and a red vase.
214
+ A cat on the left of a dog.
215
+ A stop sign on the right of a refrigerator.
216
+ A shark in the desert.
217
+ Octothorpe.
218
+ A red colored car.
219
+ Four cars on the street.
220
+ A tiger in a lab coat with a 1980s Miami vibe, turning a well oiled science content machine, digital art.
221
+ Three cats and one dog sitting on the grass.
222
+ Paying for a quarter-sized pizza with a pizza-sized quarter.
223
+ A zebra to the right of a fire hydrant.
224
+ A stack of 3 cubes. A red cube is on the top, sitting on a red cube. The red cube is in the middle, sitting on a green cube. The green cube is on the bottom.
225
+ A 1960s poster warning against climate change.
226
+ A storefront with 'Google Research Pizza Cafe' written on it.
227
+ A laptop on top of a teddy bear.
228
+ A painting by Grant Wood of an astronaut couple, american gothic style.
229
+ New York Skyline with 'Deep Learning' written with fireworks on the sky.
230
+ A storefront with 'Diffusion' written on it.
231
+ A storefront with 'Text to Image' written on it.
232
+ A small blue book sitting on a large red book.
233
+ Colouring page of large cats climbing the eifel tower in a cyberpunk future.
234
+ An emoji of a baby panda wearing a red hat, blue gloves, green shirt, and blue pants.
235
+ A photo of a confused grizzly bear in calculus class.
236
+ Paying for a quarter-sized pizza with a pizza-sized quarter.
237
+ Painting of the orange cat Otto von Garfield, Count of Bismarck-Schönhausen, Duke of Lauenburg, Minister-President of Prussia. Depicted wearing a Prussian Pickelhaube and eating his favorite meal - lasagna.
238
+ A device consisting of a circular canopy of cloth on a folding metal frame supported by a central rod, used as protection against rain or sometimes sun.
239
+ Supreme Court Justices play a baseball game with the FBI. The FBI is at bat, the justices are on the field.
240
+ A triangular pink stop sign. A pink stop sign in the shape of a triangle.
241
+ Painting of the orange cat Otto von Garfield, Count of Bismarck-Schönhausen, Duke of Lauenburg, Minister-President of Prussia. Depicted wearing a Prussian Pickelhaube and eating his favorite meal - lasagna.
242
+ A train on top of a surfboard.
243
+ A stack of 3 cubes. A red cube is on the top, sitting on a red cube. The red cube is in the middle, sitting on a green cube. The green cube is on the bottom.
244
+ A sjmall domesticated carnivorious mammnal with sof fuh,y a sthort sout, and retracwtablbe flaws. It iw widexly kept as a pet or for catchitng mic, ad many breeds zhlyde beefn develvoked.
245
+ A laptop on top of a teddy bear.
246
+ A train on top of a surfboard.
247
+ A photocopy of a photograph of a painting of a sculpture of a giraffe.
248
+ A 1960s yearbook photo with animals dressed as humans.
249
+ A pink colored giraffe.
250
+ A maglev train going vertically downward in high speed, New York Times photojournalism.
251
+ A domesticated carnivvorous mzammal that typicbally hfaas a lons sfnout, an acxujte sense off osmell, noneetractaaln crlaws, anid xbarkring,y howlingu, or whining rvoiche.
252
+ A sign that says 'Google Research Pizza Cafe'.
253
+ Two cars on the street.
254
+ A tennis racket underneath a traffic light.
255
+ A cross-section view of a brain.
256
+ One cat and one dog sitting on the grass.
257
+ A horse riding an astronaut.
258
+ A car playing soccer, digital art.
259
+ A large plant-eating domesticated mammal with solid hoofs and a flowing mane and tail, used for riding, racing, and to carry and pull loads.
260
+ Three dogs on the street.
261
+ A separate seat for one person, typically with a back and four legs.
262
+ A couple of glasses are sitting on a table.
263
+ A couch on the left of a chair.
264
+ Two cars on the street.
265
+ A photocopy of a photograph of a painting of a sculpture of a giraffe.
266
+ A black apple and a green backpack.
267
+ A pyramid made of falafel with a partial solar eclipse in the background.
268
+ A brown colored giraffe.
269
+ One cat and one dog sitting on the grass.
270
+ A pizza cooking an oven.
271
+ A church with stained glass windows depicting a hamburger and french fries.
272
+ A connection point by which firefighters can tap into a water supply.
273
+ A sign that says 'Google Research Pizza Cafe'.
274
+ 35mm macro shot a kitten licking a baby duck, studio lighting.
275
+ New York Skyline with 'Text to Image' written with fireworks on the sky.
276
+ An oil painting portrait of the regal Burger King posing with a Whopper.
277
+ A storefront with 'Google Brain Toronto' written on it.
278
+ A bridge connecting Europe and North America on the Atlantic Ocean, bird's eye view.
279
+ One cat and three dogs sitting on the grass.
280
+ Octothorpe.
281
+ A connection point by which firefighters can tap into a water supply.
282
+ A donut underneath a toilet.
283
+ Colouring page of large cats climbing the eifel tower in a cyberpunk future.
284
+ A panda making latte art.
285
+ A machine next to a parking space in a street, into which the driver puts money so as to be authorized to park the vehicle for a particular length of time.
286
+ New York Skyline with 'Google Brain Toronto' written with fireworks on the sky.
287
+ A real life photography of super mario, 8k Ultra HD.
288
+ A cat on the right of a tennis racket.
289
+ A sign that says 'Diffusion'.
290
+ An illustration of a large red elephant sitting on a small blue mouse.
291
+ A collection of nail is sitting on a table.
292
+ An appliance or compartment which is artificially kept cool and used to store food and drink.
293
+ An oil painting portrait of the regal Burger King posing with a Whopper.
294
+ Abraham Lincoln touches his toes while George Washington does chin-ups. Lincoln is barefoot. Washington is wearing boots.
295
+ A black colored dog.
296
+ One cat and two dogs sitting on the grass.
297
+ A donkey and an octopus are playing a game. The donkey is holding a rope on one end, the octopus is holding onto the other. The donkey holds the rope in its mouth. A cat is jumping over the rope.
298
+ A pink colored giraffe.
299
+ A hair drier underneath a sheep.
300
+ A couch on the left of a chair.
301
+ A cube made of denim. A cube with the texture of denim.
302
+ Jentacular.
303
+ An old photograph of a 1920s airship shaped like a pig, floating over a wheat field.
304
+ Colouring page of large cats climbing the eifel tower in a cyberpunk future.
305
+ A collection of nail is sitting on a table.
306
+ One dog on the street.
307
+ A stack of 3 cubes. A red cube is on the top, sitting on a red cube. The red cube is in the middle, sitting on a green cube. The green cube is on the bottom.
308
+ Illustration of a mouse using a mushroom as an umbrella.
309
+ A zebra to the right of a fire hydrant.
310
+ Two dogs on the street.
311
+ Photo of an athlete cat explaining it's latest scandal at a press conference to journalists.
312
+ A domesticated carnivvorous mzammal that typicbally hfaas a lons sfnout, an acxujte sense off osmell, noneetractaaln crlaws, anid xbarkring,y howlingu, or whining rvoiche.
313
+ A vehicle composed of two wheels held in a frame one behind the other, propelled by pedals and steered with handlebars attached to the front wheel.
314
+ A sign that says 'NeurIPS'.
315
+ A church with stained glass windows depicting a hamburger and french fries.
316
+ A shark in the desert.
317
+ An emoji of a baby panda wearing a red hat, blue gloves, green shirt, and blue pants.
318
+ A machine next to a parking space in a street, into which the driver puts money so as to be authorized to park the vehicle for a particular length of time.
319
+ Artophagous.
320
+ A car on the left of a bus.
321
+ A storefront with 'Google Brain Toronto' written on it.
322
+ A cube made of denim. A cube with the texture of denim.
323
+ A red colored banana.
324
+ Two dogs on the street.
325
+ Five cars on the street.
326
+ A mechanical or electrical device for measuring time.
327
+ Acersecomicke.
328
+ An illustration of a large red elephant sitting on a small blue mouse.
329
+ A triangular pink stop sign. A pink stop sign in the shape of a triangle.
330
+ Peristeronic.
331
+ A keyboard made of water, the water is made of light, the light is turned off.
332
+ Greek statue of a man tripping over a cat.
333
+ Two cats and three dogs sitting on the grass.
334
+ New York Skyline with 'Google Brain Toronto' written with fireworks on the sky.
335
+ Rbefraigerator.
336
+ A storefront with 'Google Research Pizza Cafe' written on it.
337
+ Four cars on the street.
338
+ An oil painting portrait of the regal Burger King posing with a Whopper.
339
+ A fluffy baby sloth with a knitted hat trying to figure out a laptop, close up, highly detailed, studio lighting, screen reflecting in its eyes.
340
+ An oil painting of a couple in formal evening wear going home get caught in a heavy downpour with no umbrellas.
341
+ Painting of the orange cat Otto von Garfield, Count of Bismarck-Schönhausen, Duke of Lauenburg, Minister-President of Prussia. Depicted wearing a Prussian Pickelhaube and eating his favorite meal - lasagna.
342
+ A grocery store refrigerator has pint cartons of milk on the top shelf, quart cartons on the middle shelf, and gallon plastic jugs on the bottom shelf.
343
+ A real life photography of super mario, 8k Ultra HD.
344
+ A carrot on the left of a broccoli.
345
+ Darth Vader playing with raccoon in Mars during sunset.
346
+ Four dogs on the street.
347
+ Photo of a cat singing in a barbershop quartet.
348
+ A real life photography of super mario, 8k Ultra HD.
349
+ A triangular pink stop sign. A pink stop sign in the shape of a triangle.
350
+ A small blue book sitting on a large red book.
351
+ A green colored banana.
352
+ A bicycle on top of a boat.
353
+ A blue cup and a green cell phone.
354
+ A cat on the right of a tennis racket.
355
+ A stop sign on the right of a refrigerator.
356
+ A sign that says 'Diffusion'.
357
+ A blue coloured pizza.
358
+ A device consisting of a circular canopy of cloth on a folding metal frame supported by a central rod, used as protection against rain or sometimes sun.
359
+ A green cup and a blue cell phone.
360
+ Three cats and two dogs sitting on the grass.
361
+ A laptop on top of a teddy bear.
362
+ A medieval painting of the wifi not working.
363
+ A small vessel propelled on water by oars, sails, or an engine.
364
+ Photo of a mega Lego space station inside a kid's bedroom.
365
+ A car on the left of a bus.
366
+ A green colored banana.
367
+ A photo of a confused grizzly bear in calculus class.
368
+ Three dogs on the street.
369
+ A medieval painting of the wifi not working.
370
+ One cat and three dogs sitting on the grass.
371
+ A red colored car.
372
+ Photo of a mega Lego space station inside a kid's bedroom.
373
+ Abraham Lincoln touches his toes while George Washington does chin-ups. Lincoln is barefoot. Washington is wearing boots.
374
+ Photo of a cat singing in a barbershop quartet.
375
+ A tennis racket underneath a traffic light.
376
+ Two cars on the street.
377
+ A sign that says 'Hello World'.
378
+ A church with stained glass windows depicting a hamburger and french fries.
379
+ A horse riding an astronaut.
380
+ A cross-section view of a brain.
381
+ A couple of glasses are sitting on a table.
382
+ A domesticated carnivorous mammal that typically has a long snout, an acute sense of smell, nonretractable claws, and a barking, howling, or whining voice.
383
+ A green cup and a blue cell phone.
384
+ Acersecomicke.
385
+ A giraffe underneath a microwave.
386
+ An elephant is behind a tree. You can see the trunk on one side and the back legs on the other.
387
+ A train on top of a surfboard.
388
+ A banana on the left of an apple.
389
+ A blue cup and a green cell phone.
390
+ A blue colored dog.
391
+ A sphere made of kitchen tile. A sphere with the texture of kitchen tile.
392
+ A couple of glasses are sitting on a table.
393
+ Matutinal.
394
+ An instrument used for cutting cloth, paper, and other thin material, consisting of two blades laid one on top of the other and fastened in the middle so as to allow them to be opened and closed by a thumb and finger inserted through rings on the end of their handles.
395
+ New York Skyline with 'Diffusion' written with fireworks on the sky.
396
+ A white car and a red sheep.
397
+ A sign that says 'NeurIPS'.
398
+ Five cars on the street.
399
+ A red colored dog.
400
+ New York Skyline with 'Text to Image' written with fireworks on the sky.
401
+ New York Skyline with 'Diffusion' written with fireworks on the sky.
402
+ Three cats and three dogs sitting on the grass.
403
+ A storefront with 'Deep Learning' written on it.
404
+ A hair drier underneath a sheep.
405
+ An instqrumemnt used for cutting cloth, paper, axdz othr thdin mteroial, consamistng of two blades lad one on tvopb of the other and fhastned in tle mixdqdjle so as to bllow them txo be pened and closed by thumb and fitngesr inserted tgrough rings on kthe end oc thei vatndlzes.
406
+ One dog on the street.
407
+ A fish eating a pelican.
408
+ A baby fennec sneezing onto a strawberry, detailed, macro, studio light, droplets, backlit ears.
409
+ A maglev train going vertically downward in high speed, New York Times photojournalism.
410
+ Supreme Court Justices play a baseball game with the FBI. The FBI is at bat, the justices are on the field.
411
+ A photo of a confused grizzly bear in calculus class.
412
+ A triangular pink stop sign. A pink stop sign in the shape of a triangle.
413
+ Matutinal.
414
+ Two cars on the street.
415
+ An orange colored sandwich.
416
+ A storefront with 'NeurIPS' written on it.
417
+ A grocery store refrigerator has pint cartons of milk on the top shelf, quart cartons on the middle shelf, and gallon plastic jugs on the bottom shelf.
418
+ A stack of 3 plates. A blue plate is on the top, sitting on a blue plate. The blue plate is in the middle, sitting on a green plate. The green plate is on the bottom.
419
+ In late afternoon in January in New England, a man stands in the shadow of a maple tree.
420
+ Hovering cow abducting aliens.
421
+ A triangular pink stop sign. A pink stop sign in the shape of a triangle.
422
+ A photocopy of a photograph of a painting of a sculpture of a giraffe.
423
+ A separate seat for one person, typically with a back and four legs.
424
+ A horse riding an astronaut.
425
+ Three cats and three dogs sitting on the grass.
426
+ A bird scaring a scarecrow.
427
+ Tcennis rpacket.
428
+ One car on the street.
429
+ A mechanical or electrical device for measuring time.
430
+ New York Skyline with 'NeurIPS' written with fireworks on the sky.
431
+ A fish eating a pelican.
432
+ A black apple and a green backpack.
433
+ A cube made of denim. A cube with the texture of denim.
434
+ A storefront with 'Deep Learning' written on it.
435
+ New York Skyline with 'Deep Learning' written with fireworks on the sky.
436
+ A brown colored giraffe.
437
+ A bird scaring a scarecrow.
438
+ A blue colored dog.
439
+ An emoji of a baby panda wearing a red hat, green gloves, red shirt, and green pants.
440
+ A green cup and a blue cell phone.
441
+ A carrot on the left of a broccoli.
442
+ A green apple and a black backpack.
443
+ A yellow book and a red vase.
444
+ A triangular purple flower pot. A purple flower pot in the shape of a triangle.
445
+ A small vessel propelled on water by oars, sails, or an engine.
446
+ An orange colored sandwich.
447
+ A tomato has been put on top of a pumpkin on a kitchen stool. There is a fork sticking into the pumpkin. The scene is viewed from above.
448
+ Rbefraigerator.
449
+ A machine next to a parking space in a street, into which the driver puts money so as to be authorized to park the vehicle for a particular length of time.
450
+ A hair drier underneath a sheep.
451
+ A grocery store refrigerator has pint cartons of milk on the top shelf, quart cartons on the middle shelf, and gallon plastic jugs on the bottom shelf.
452
+ A sign that says 'Deep Learning'.
453
+ A cross-section view of a brain.
454
+ A black colored car.
455
+ Two cars on the street.
456
+ Photo of an athlete cat explaining it's latest scandal at a press conference to journalists.
457
+ Rainbow coloured penguin.
458
+ A black apple and a green backpack.
459
+ Darth Vader playing with raccoon in Mars during sunset.
460
+ A spider with a moustache bidding an equally gentlemanly grasshopper a good day during his walk to work.
461
+ One cat and three dogs sitting on the grass.
462
+ 35mm macro shot a kitten licking a baby duck, studio lighting.
463
+ An umbrella on top of a spoon.
464
+ Bzaseball galove.
465
+ Greek statue of a man tripping over a cat.
466
+ Supreme Court Justices play a baseball game with the FBI. The FBI is at bat, the justices are on the field.
467
+ An instqrumemnt used for cutting cloth, paper, axdz othr thdin mteroial, consamistng of two blades lad one on tvopb of the other and fhastned in tle mixdqdjle so as to bllow them txo be pened and closed by thumb and fitngesr inserted tgrough rings on kthe end oc thei vatndlzes.
468
+ A car on the left of a bus.
469
+ One dog on the street.
470
+ A church with stained glass windows depicting a hamburger and french fries.
471
+ A vehicle composed of two wheels held in a frame one behind the other, propelled by pedals and steered with handlebars attached to the front wheel.
472
+ A cross-section view of a brain.
473
+ A donut underneath a toilet.
474
+ A small blue book sitting on a large red book.
475
+ A smafml vessef epropoeilled on watvewr by ors, sauls, or han engie.
476
+ A sign that says 'Deep Learning'.
477
+ Photo of a cat singing in a barbershop quartet.
478
+ A cube made of brick. A cube with the texture of brick.
479
+ An oil painting of a couple in formal evening wear going home get caught in a heavy downpour with no umbrellas.
480
+ One car on the street.
481
+ A mechanical or electrical device for measuring time.
482
+ Hyper-realistic photo of an abandoned industrial site during a storm.
483
+ A giraffe underneath a microwave.
484
+ New York Skyline with 'Google Brain Toronto' written with fireworks on the sky.
485
+ An ancient Egyptian painting depicting an argument over whose turn it is to take out the trash.
486
+ A red book and a yellow vase.
487
+ A yellow colored giraffe.
488
+ A smafml vessef epropoeilled on watvewr by ors, sauls, or han engie.
489
+ A long curved fruit which grows in clusters and has soft pulpy flesh and yellow skin when ripe.
490
+ New York Skyline with 'Hello World' written with fireworks on the sky.
491
+ Two cats and two dogs sitting on the grass.
492
+ Photo of a cat singing in a barbershop quartet.
493
+ Colouring page of large cats climbing the eifel tower in a cyberpunk future.
494
+ Abraham Lincoln touches his toes while George Washington does chin-ups. Lincoln is barefoot. Washington is wearing boots.
495
+ A medieval painting of the wifi not working.
496
+ A car playing soccer, digital art.
497
+ A black colored car.
498
+ An orange colored sandwich.
499
+ A ldarge keybord msical instroument lwith a woden case enmclosig a qsouvnkboajrd and mfgtal strivgf, which are strucrk b hammrs when the nels are depresdsmed.f lhe strsingsj' vibration ie stopped by damperds when the keys re released and can bce regulavewdd for lengh and vnolume y two or three pedalvs.
500
+ An instrument used for cutting cloth, paper, and other thin material, consisting of two blades laid one on top of the other and fastened in the middle so as to allow them to be opened and closed by a thumb and finger inserted through rings on the end of their handles.
501
+ Four cars on the street.
502
+ A small domesticated carnivorous mammal with soft fur, a short snout, and retractable claws. It is widely kept as a pet or for catching mice, and many breeds have been developed.
503
+ A donkey and an octopus are playing a game. The donkey is holding a rope on one end, the octopus is holding onto the other. The donkey holds the rope in its mouth. A cat is jumping over the rope.
504
+ An illustration of a large red elephant sitting on a small blue mouse.
505
+ Octothorpe.
506
+ A fisheye lens view of a turtle sitting in a forest.
507
+ New York Skyline with 'Text to Image' written with fireworks on the sky.
508
+ A storefront with 'Deep Learning' written on it.
509
+ A spider with a moustache bidding an equally gentlemanly grasshopper a good day during his walk to work.
510
+ An oil painting of a couple in formal evening wear going home get caught in a heavy downpour with no umbrellas.
511
+ An emoji of a baby panda wearing a red hat, blue gloves, green shirt, and blue pants.
512
+ An IT-guy trying to fix hardware of a PC tower is being tangled by the PC cables like Laokoon. Marble, copy after Hellenistic original from ca. 200 BC. Found in the Baths of Trajan, 1506.
513
+ A sheep to the right of a wine glass.
514
+ A cube made of denim. A cube with the texture of denim.
515
+ Painting of the orange cat Otto von Garfield, Count of Bismarck-Schönhausen, Duke of Lauenburg, Minister-President of Prussia. Depicted wearing a Prussian Pickelhaube and eating his favorite meal - lasagna.
516
+ A sign that says 'Google Research Pizza Cafe'.
517
+ A ldarge keybord msical instroument lwith a woden case enmclosig a qsouvnkboajrd and mfgtal strivgf, which are strucrk b hammrs when the nels are depresdsmed.f lhe strsingsj' vibration ie stopped by damperds when the keys re released and can bce regulavewdd for lengh and vnolume y two or three pedalvs.
518
+ 35mm macro shot a kitten licking a baby duck, studio lighting.
519
+ A shark in the desert.
520
+ A green colored banana.
521
+ A green cup and a blue cell phone.
522
+ Backlotter.
523
+ Darth Vader playing with raccoon in Mars during sunset.
524
+ A green apple and a black backpack.
525
+ A tomato has been put on top of a pumpkin on a kitchen stool. There is a fork sticking into the pumpkin. The scene is viewed from above.
526
+ A red colored dog.
527
+ A red book and a yellow vase.
528
+ Rbefraigerator.
529
+ A train on top of a surfboard.
530
+ Dininrg tablez.
531
+ A separate seat for one person, typically with a back and four legs.
532
+ A domesticated carnivvorous mzammal that typicbally hfaas a lons sfnout, an acxujte sense off osmell, noneetractaaln crlaws, anid xbarkring,y howlingu, or whining rvoiche.
533
+ A black colored dog.
534
+ A pink colored giraffe.
535
+ New York Skyline with 'Google Brain Toronto' written with fireworks on the sky.
536
+ Supreme Court Justices play a baseball game with the FBI. The FBI is at bat, the justices are on the field.
537
+ A white car and a red sheep.
538
+ An organ of soft nervous tissue contained in the skull of vertebrates, functioning as the coordinating center of sensation and intellectual and nervous activity.
539
+ Tcennis rpacket.
540
+ A red book and a yellow vase.
541
+ A cross-section view of a brain.
542
+ An illustration of a small green elephant standing behind a large red mouse.
543
+ One dog on the street.
544
+ A zebra underneath a broccoli.
545
+ A zebra to the right of a fire hydrant.
546
+ A large plant-eating domesticated mammal with solid hoofs and a flowing mane and tail, used for riding, racing, and to carry and pull loads.
547
+ An elephant under the sea.
548
+ An elephant under the sea.
549
+ A pizza on the right of a suitcase.
550
+ Greek statue of a man tripping over a cat.
551
+ A couple of glasses are sitting on a table.
552
+ A storefront with 'Diffusion' written on it.
553
+ A sheep to the right of a wine glass.
554
+ A fisheye lens view of a turtle sitting in a forest.
555
+ A vehicle composed of two wheels held in a frame one behind the other, propelled by pedals and steered with handlebars attached to the front wheel.
556
+ A 1960s poster warning against climate change.
557
+ Three cars on the street.
558
+ An umbrella on top of a spoon.
559
+ A zebra underneath a broccoli.
560
+ A black colored dog.
561
+ A small domesticated carnivorous mammal with soft fur, a short snout, and retractable claws. It is widely kept as a pet or for catching mice, and many breeds have been developed.
562
+ A sign that says 'Google Brain Toronto'.
563
+ A large plant-eating domesticated mammal with solid hoofs and a flowing mane and tail, used for riding, racing, and to carry and pull loads.
564
+ A sign that says 'NeurIPS'.
565
+ Pafrking metr.
566
+ A sign that says 'Text to Image'.
567
+ A screenshot of an iOS app for ordering different types of milk.
568
+ A large motor vehicle carrying passengers by road, typically one serving the public on a fixed route and for a fare.
569
+ One cat and two dogs sitting on the grass.
570
+ A cube made of brick. A cube with the texture of brick.
571
+ A storefront with 'Text to Image' written on it.
572
+ A screenshot of an iOS app for ordering different types of milk.
573
+ Two dogs on the street.
574
+ Dininrg tablez.
575
+ A baby fennec sneezing onto a strawberry, detailed, macro, studio light, droplets, backlit ears.
576
+ A cat on the left of a dog.
577
+ A machine resembling a human being and able to replicate certain human movements and functions automatically.
578
+ A panda making latte art.
579
+ A storefront with 'Hello World' written on it.
580
+ New York Skyline with 'Diffusion' written with fireworks on the sky.
581
+ Two cats and three dogs sitting on the grass.
582
+ McDonalds Church.
583
+ A cat on the left of a dog.
584
+ Octothorpe.
585
+ Painting of Mona Lisa but the view is from behind of Mona Lisa.
586
+ A smafml vessef epropoeilled on watvewr by ors, sauls, or han engie.
587
+ A maglev train going vertically downward in high speed, New York Times photojournalism.
588
+ Three dogs on the street.
589
+ A mechanical or electrical device for measuring time.
590
+ A pear cut into seven pieces arranged in a ring.
591
+ Lego Arnold Schwarzenegger.
592
+ An appliance or compartment which is artificially kept cool and used to store food and drink.
593
+ A black colored car.
594
+ An oil painting portrait of the regal Burger King posing with a Whopper.
595
+ A black colored banana.
596
+ Three cats and three dogs sitting on the grass.
597
+ A domesticated carnivorous mammal that typically has a long snout, an acute sense of smell, nonretractable claws, and a barking, howling, or whining voice.
598
+ A wine glass on top of a dog.
599
+ A tomato has been put on top of a pumpkin on a kitchen stool. There is a fork sticking into the pumpkin. The scene is viewed from above.
600
+ Backlotter.
601
+ A bird scaring a scarecrow.
602
+ A single clock is sitting on a table.
603
+ Bzaseball galove.
604
+ A yellow colored giraffe.
605
+ A white colored sandwich.
606
+ A giraffe underneath a microwave.
607
+ A couch on the left of a chair.
608
+ A pizza on the right of a suitcase.
609
+ Lego Arnold Schwarzenegger.
610
+ A donut underneath a toilet.
611
+ A triangular orange picture frame. An orange picture frame in the shape of a triangle.
612
+ McDonalds Church.
613
+ 35mm macro shot a kitten licking a baby duck, studio lighting.
614
+ A machine resembling a human being and able to replicate certain human movements and functions automatically.
615
+ An elephant is behind a tree. You can see the trunk on one side and the back legs on the other.
616
+ A spider with a moustache bidding an equally gentlemanly grasshopper a good day during his walk to work.
617
+ An umbrella on top of a spoon.
618
+ Lego Arnold Schwarzenegger.
619
+ A yellow and black bus cruising through the rainforest.
620
+ A giraffe underneath a microwave.
621
+ A cube made of denim. A cube with the texture of denim.
622
+ A sheep to the right of a wine glass.
623
+ A bridge connecting Europe and North America on the Atlantic Ocean, bird's eye view.
624
+ A 1960s yearbook photo with animals dressed as humans.
625
+ Paying for a quarter-sized pizza with a pizza-sized quarter.
626
+ A black colored sandwich.
627
+ A large keyboard musical instrument with a wooden case enclosing a soundboard and metal strings, which are struck by hammers when the keys are depressed. The strings' vibration is stopped by dampers when the keys are released and can be regulated for length and volume by two or three pedals.
628
+ A spider with a moustache bidding an equally gentlemanly grasshopper a good day during his walk to work.
629
+ One car on the street.
630
+ A carrot on the left of a broccoli.
631
+ Two cats and three dogs sitting on the grass.
632
+ A stack of 3 books. A green book is on the top, sitting on a red book. The red book is in the middle, sitting on a blue book. The blue book is on the bottom.
633
+ Two cats and one dog sitting on the grass.
634
+ An instqrumemnt used for cutting cloth, paper, axdz othr thdin mteroial, consamistng of two blades lad one on tvopb of the other and fhastned in tle mixdqdjle so as to bllow them txo be pened and closed by thumb and fitngesr inserted tgrough rings on kthe end oc thei vatndlzes.
635
+ Dininrg tablez.
636
+ A connection point by which firefighters can tap into a water supply.
637
+ Four dogs on the street.
638
+ A sign that says 'Hello World'.
639
+ Photo of a mega Lego space station inside a kid's bedroom.
640
+ McDonalds Church.
641
+ Illustration of a mouse using a mushroom as an umbrella.
642
+ A magnifying glass over a page of a 1950s batman comic.
643
+ Hyper-realistic photo of an abandoned industrial site during a storm.
644
+ A magnifying glass over a page of a 1950s batman comic.
645
+ An umbrella on top of a spoon.
646
+ A fluffy baby sloth with a knitted hat trying to figure out a laptop, close up, highly detailed, studio lighting, screen reflecting in its eyes.
647
+ A large keyboard musical instrument with a wooden case enclosing a soundboard and metal strings, which are struck by hammers when the keys are depressed. The strings' vibration is stopped by dampers when the keys are released and can be regulated for length and volume by two or three pedals.
648
+ A red colored dog.
649
+ A red colored car.
650
+ A black colored car.
651
+ Five cars on the street.
652
+ A baby fennec sneezing onto a strawberry, detailed, macro, studio light, droplets, backlit ears.
653
+ In late afternoon in January in New England, a man stands in the shadow of a maple tree.
654
+ Photo of a cat singing in a barbershop quartet.
655
+ Hovering cow abducting aliens.
656
+ An old photograph of a 1920s airship shaped like a pig, floating over a wheat field.
657
+ An organ of soft nervous tissue contained in the skull of vertebrates, functioning as the coordinating center of sensation and intellectual and nervous activity.
658
+ A triangular purple flower pot. A purple flower pot in the shape of a triangle.
659
+ A pear cut into seven pieces arranged in a ring.
660
+ A red colored car.
661
+ Two cats and one dog sitting on the grass.
662
+ A cube made of brick. A cube with the texture of brick.
663
+ A pyramid made of falafel with a partial solar eclipse in the background.
664
+ A yellow colored giraffe.
665
+ An instrument used for cutting cloth, paper, and other thin material, consisting of two blades laid one on top of the other and fastened in the middle so as to allow them to be opened and closed by a thumb and finger inserted through rings on the end of their handles.
666
+ A sjmall domesticated carnivorious mammnal with sof fuh,y a sthort sout, and retracwtablbe flaws. It iw widexly kept as a pet or for catchitng mic, ad many breeds zhlyde beefn develvoked.
667
+ A couch on the left of a chair.
668
+ A photocopy of a photograph of a painting of a sculpture of a giraffe.
669
+ A sign that says 'Google Brain Toronto'.
670
+ A sign that says 'Text to Image'.
671
+ Rainbow coloured penguin.
672
+ Two dogs on the street.
673
+ A triangular orange picture frame. An orange picture frame in the shape of a triangle.
674
+ Colouring page of large cats climbing the eifel tower in a cyberpunk future.
675
+ A white colored sandwich.
676
+ A stop sign on the right of a refrigerator.
677
+ A long curved fruit which grows in clusters and has soft pulpy flesh and yellow skin when ripe.
678
+ Three cats and two dogs sitting on the grass.
679
+ A hair drier underneath a sheep.
680
+ A train on top of a surfboard.
681
+ A large plant-eating domesticated mammal with solid hoofs and a flowing mane and tail, used for riding, racing, and to carry and pull loads.
682
+ A sign that says 'Google Research Pizza Cafe'.
683
+ A stop sign on the right of a refrigerator.
684
+ A tiger in a lab coat with a 1980s Miami vibe, turning a well oiled science content machine, digital art.
685
+ A device consisting of a circular canopy of cloth on a folding metal frame supported by a central rod, used as protection against rain or sometimes sun.
686
+ An illustration of a small green elephant standing behind a large red mouse.
687
+ A sign that says 'Hello World'.
688
+ Lego Arnold Schwarzenegger.
689
+ Five dogs on the street.
690
+ A storefront with 'Hello World' written on it.
691
+ An instrument used for cutting cloth, paper, and other thin material, consisting of two blades laid one on top of the other and fastened in the middle so as to allow them to be opened and closed by a thumb and finger inserted through rings on the end of their handles.
692
+ A tomato has been put on top of a pumpkin on a kitchen stool. There is a fork sticking into the pumpkin. The scene is viewed from above.
693
+ In late afternoon in January in New England, a man stands in the shadow of a maple tree.
694
+ Jentacular.
695
+ Four dogs on the street.
696
+ An old photograph of a 1920s airship shaped like a pig, floating over a wheat field.
697
+ A triangular purple flower pot. A purple flower pot in the shape of a triangle.
698
+ A black colored banana.
699
+ 35mm macro shot a kitten licking a baby duck, studio lighting.
700
+ Bzaseball galove.
701
+ A fisheye lens view of a turtle sitting in a forest.
702
+ A donut underneath a toilet.
703
+ A donkey and an octopus are playing a game. The donkey is holding a rope on one end, the octopus is holding onto the other. The donkey holds the rope in its mouth. A cat is jumping over the rope.
704
+ A fluffy baby sloth with a knitted hat trying to figure out a laptop, close up, highly detailed, studio lighting, screen reflecting in its eyes.
705
+ A green apple and a black backpack.
706
+ An illustration of a large red elephant sitting on a small blue mouse.
707
+ A smafml vessef epropoeilled on watvewr by ors, sauls, or han engie.
708
+ A machine next to a parking space in a street, into which the driver puts money so as to be authorized to park the vehicle for a particular length of time.
709
+ Rainbow coloured penguin.
710
+ Three cats and one dog sitting on the grass.
711
+ An old photograph of a 1920s airship shaped like a pig, floating over a wheat field.
712
+ New York Skyline with 'Google Research Pizza Cafe' written with fireworks on the sky.
713
+ A painting by Grant Wood of an astronaut couple, american gothic style.
714
+ Four dogs on the street.
715
+ A large motor vehicle carrying passengers by road, typically one serving the public on a fixed route and for a fare.
716
+ A couple of glasses are sitting on a table.
717
+ In late afternoon in January in New England, a man stands in the shadow of a maple tree.
718
+ A brown colored giraffe.
719
+ A bridge connecting Europe and North America on the Atlantic Ocean, bird's eye view.
720
+ Five dogs on the street.
721
+ New York Skyline with 'Text to Image' written with fireworks on the sky.
722
+ An appliance or compartment which is artificially kept cool and used to store food and drink.
723
+ A real life photography of super mario, 8k Ultra HD.
724
+ A pink colored car.
725
+ A painting by Grant Wood of an astronaut couple, american gothic style.
726
+ A car on the left of a bus.
727
+ A large plant-eating domesticated mammal with solid hoofs and a flowing mane and tail, used for riding, racing, and to carry and pull loads.
728
+ Pafrking metr.
729
+ An illustration of a small green elephant standing behind a large red mouse.
730
+ A blue cup and a green cell phone.
731
+ New York Skyline with 'NeurIPS' written with fireworks on the sky.
732
+ A storefront with 'Google Brain Toronto' written on it.
733
+ A painting by Grant Wood of an astronaut couple, american gothic style.
734
+ A black colored sandwich.
735
+ A fish eating a pelican.
736
+ An emoji of a baby panda wearing a red hat, blue gloves, green shirt, and blue pants.
737
+ A vehicle composed of two wheels held in a frame one behind the other, propelled by pedals and steered with handlebars attached to the front wheel.
738
+ A tennis racket underneath a traffic light.
739
+ Three cars on the street.
740
+ One car on the street.
741
+ A tennis racket underneath a traffic light.
742
+ A maglev train going vertically downward in high speed, New York Times photojournalism.
743
+ Photo of an athlete cat explaining it's latest scandal at a press conference to journalists.
744
+ A red book and a yellow vase.
745
+ A shark in the desert.
746
+ An organ of soft nervous tissue contained in the skull of vertebrates, functioning as the coordinating center of sensation and intellectual and nervous activity.
747
+ A sign that says 'Text to Image'.
748
+ A stack of 3 books. A green book is on the top, sitting on a red book. The red book is in the middle, sitting on a blue book. The blue book is on the bottom.
749
+ A shark in the desert.
750
+ A 1960s poster warning against climate change.
751
+ Backlotter.
752
+ One cat and two dogs sitting on the grass.
753
+ Matutinal.
754
+ A cat on the right of a tennis racket.
755
+ A laptop on top of a teddy bear.
756
+ A white colored sandwich.
757
+ A yellow and black bus cruising through the rainforest.
758
+ A photocopy of a photograph of a painting of a sculpture of a giraffe.
759
+ A side view of an owl sitting in a field.
760
+ A pizza on the right of a suitcase.
761
+ A wine glass on top of a dog.
762
+ A realistic photo of a Pomeranian dressed up like a 1980s professional wrestler with neon green and neon orange face paint and bright green wrestling tights with bright orange boots.
763
+ A pear cut into seven pieces arranged in a ring.
764
+ Acersecomicke.
765
+ Painting of Mona Lisa but the view is from behind of Mona Lisa.
766
+ A small vessel propelled on water by oars, sails, or an engine.
767
+ Painting of the orange cat Otto von Garfield, Count of Bismarck-Schönhausen, Duke of Lauenburg, Minister-President of Prussia. Depicted wearing a Prussian Pickelhaube and eating his favorite meal - lasagna.
768
+ A cat on the left of a dog.
769
+ A red colored banana.
770
+ A domesticated carnivorous mammal that typically has a long snout, an acute sense of smell, nonretractable claws, and a barking, howling, or whining voice.
771
+ A sign that says 'Google Brain Toronto'.
772
+ A collection of nail is sitting on a table.
773
+ A pyramid made of falafel with a partial solar eclipse in the background.
774
+ A realistic photo of a Pomeranian dressed up like a 1980s professional wrestler with neon green and neon orange face paint and bright green wrestling tights with bright orange boots.
775
+ A cube made of brick. A cube with the texture of brick.
776
+ New York Skyline with 'Text to Image' written with fireworks on the sky.
777
+ A fish eating a pelican.
778
+ A pink colored giraffe.
779
+ One cat and three dogs sitting on the grass.
780
+ A keyboard made of water, the water is made of light, the light is turned off.
781
+ Greek statue of a man tripping over a cat.
782
+ A machine resembling a human being and able to replicate certain human movements and functions automatically.
783
+ A yellow and black bus cruising through the rainforest.
784
+ An elephant is behind a tree. You can see the trunk on one side and the back legs on the other.
785
+ A small domesticated carnivorous mammal with soft fur, a short snout, and retractable claws. It is widely kept as a pet or for catching mice, and many breeds have been developed.
786
+ Dininrg tablez.
787
+ A sign that says 'NeurIPS'.
788
+ An illustration of a small green elephant standing behind a large red mouse.
789
+ A collection of nail is sitting on a table.
790
+ An oil painting of a couple in formal evening wear going home get caught in a heavy downpour with no umbrellas.
791
+ New York Skyline with 'Hello World' written with fireworks on the sky.
792
+ A storefront with 'Text to Image' written on it.
793
+ A storefront with 'Deep Learning' written on it.
794
+ Three cats and two dogs sitting on the grass.
795
+ A red car and a white sheep.
796
+ A domesticated carnivvorous mzammal that typicbally hfaas a lons sfnout, an acxujte sense off osmell, noneetractaaln crlaws, anid xbarkring,y howlingu, or whining rvoiche.
797
+ A mechanical or electrical device for measuring time.
798
+ A bridge connecting Europe and North America on the Atlantic Ocean, bird's eye view.
799
+ An appliance or compartment which is artificially kept cool and used to store food and drink.
800
+ A pizza cooking an oven.
801
+ A car playing soccer, digital art.
802
+ A blue coloured pizza.
803
+ A machine next to a parking space in a street, into which the driver puts money so as to be authorized to park the vehicle for a particular length of time.
804
+ Octothorpe.
805
+ A yellow book and a red vase.
806
+ A bicycle on top of a boat.
807
+ A device consisting of a circular canopy of cloth on a folding metal frame supported by a central rod, used as protection against rain or sometimes sun.
808
+ An orange colored sandwich.
809
+ Acersecomicke.
810
+ A magnifying glass over a page of a 1950s batman comic.
811
+ A black apple and a green backpack.
812
+ A bird scaring a scarecrow.
813
+ A sign that says 'Deep Learning'.
814
+ A bicycle on top of a boat.
815
+ Painting of Mona Lisa but the view is from behind of Mona Lisa.
816
+ Three dogs on the street.
817
+ A stack of 3 plates. A blue plate is on the top, sitting on a blue plate. The blue plate is in the middle, sitting on a green plate. The green plate is on the bottom.
818
+ A red car and a white sheep.
819
+ Greek statue of a man tripping over a cat.
820
+ Three dogs on the street.
821
+ A sheep to the right of a wine glass.
822
+ One cat and one dog sitting on the grass.
823
+ A black colored sandwich.
824
+ Peristeronic.
825
+ Three cats and two dogs sitting on the grass.
826
+ A 1960s yearbook photo with animals dressed as humans.
827
+ A sign that says 'Diffusion'.
828
+ A sign that says 'Google Research Pizza Cafe'.
829
+ A blue bird and a brown bear.
830
+ A yellow and black bus cruising through the rainforest.
831
+ A smafml vessef epropoeilled on watvewr by ors, sauls, or han engie.
832
+ Bzaseball galove.
833
+ Artophagous.
834
+ A sign that says 'Text to Image'.
835
+ A baby fennec sneezing onto a strawberry, detailed, macro, studio light, droplets, backlit ears.
836
+ A fisheye lens view of a turtle sitting in a forest.
837
+ A storefront with 'Hello World' written on it.
838
+ A connection point by which firefighters can tap into a water supply.
839
+ A separate seat for one person, typically with a back and four legs.
840
+ A 1960s yearbook photo with animals dressed as humans.
841
+ A sphere made of kitchen tile. A sphere with the texture of kitchen tile.
842
+ A black colored banana.
843
+ A vehicle composed of two wheels held in a frame one behind the other, propelled by pedals and steered with handlebars attached to the front wheel.
844
+ Four cars on the street.
845
+ Three cats and three dogs sitting on the grass.
846
+ Five dogs on the street.
847
+ An ancient Egyptian painting depicting an argument over whose turn it is to take out the trash.
848
+ A storefront with 'Diffusion' written on it.
849
+ A pizza cooking an oven.
850
+ Darth Vader playing with raccoon in Mars during sunset.
851
+ A carrot on the left of a broccoli.
852
+ A tomato has been put on top of a pumpkin on a kitchen stool. There is a fork sticking into the pumpkin. The scene is viewed from above.
853
+ A storefront with 'Diffusion' written on it.
854
+ A red book and a yellow vase.
855
+ Peristeronic.
856
+ An organ of soft nervous tissue contained in the skull of vertebrates, functioning as the coordinating center of sensation and intellectual and nervous activity.
857
+ A donkey and an octopus are playing a game. The donkey is holding a rope on one end, the octopus is holding onto the other. The donkey holds the rope in its mouth. A cat is jumping over the rope.
858
+ A couch on the left of a chair.
859
+ A sphere made of kitchen tile. A sphere with the texture of kitchen tile.
860
+ A white car and a red sheep.
861
+ Artophagous.
862
+ A stack of 3 books. A green book is on the top, sitting on a red book. The red book is in the middle, sitting on a blue book. The blue book is on the bottom.
863
+ A pizza cooking an oven.
864
+ A triangular purple flower pot. A purple flower pot in the shape of a triangle.
865
+ A brown bird and a blue bear.
866
+ An IT-guy trying to fix hardware of a PC tower is being tangled by the PC cables like Laokoon. Marble, copy after Hellenistic original from ca. 200 BC. Found in the Baths of Trajan, 1506.
867
+ A storefront with 'Google Research Pizza Cafe' written on it.
868
+ A storefront with 'Google Research Pizza Cafe' written on it.
869
+ A large thick-skinned semiaquatic African mammal, with massive jaws and large tusks.
870
+ An appliance or compartment which is artificially kept cool and used to store food and drink.
871
+ A donut underneath a toilet.
872
+ A blue bird and a brown bear.
873
+ A 1960s poster warning against climate change.
874
+ A white colored sandwich.
875
+ A white colored sandwich.
876
+ A stop sign on the right of a refrigerator.
877
+ A storefront with 'Hello World' written on it.
878
+ Five dogs on the street.
879
+ Three cars on the street.
880
+ A keyboard made of water, the water is made of light, the light is turned off.
881
+ A red colored dog.
882
+ Two cats and three dogs sitting on the grass.
883
+ A spider with a moustache bidding an equally gentlemanly grasshopper a good day during his walk to work.
884
+ A pink colored car.
885
+ A tiger in a lab coat with a 1980s Miami vibe, turning a well oiled science content machine, digital art.
886
+ Photo of an athlete cat explaining it's latest scandal at a press conference to journalists.
887
+ A realistic photo of a Pomeranian dressed up like a 1980s professional wrestler with neon green and neon orange face paint and bright green wrestling tights with bright orange boots.
888
+ A type of digital currency in which a record of transactions is maintained and new units of currency are generated by the computational solution of mathematical problems, and which operates independently of a central bank.
889
+ A sign that says 'Hello World'.
890
+ An ancient Egyptian painting depicting an argument over whose turn it is to take out the trash.
891
+ A white car and a red sheep.
892
+ Illustration of a mouse using a mushroom as an umbrella.
893
+ A red colored banana.
894
+ Three cats and one dog sitting on the grass.
895
+ A car playing soccer, digital art.
896
+ A sjmall domesticated carnivorious mammnal with sof fuh,y a sthort sout, and retracwtablbe flaws. It iw widexly kept as a pet or for catchitng mic, ad many breeds zhlyde beefn develvoked.
897
+ Rbefraigerator.
898
+ A triangular orange picture frame. An orange picture frame in the shape of a triangle.
899
+ Rainbow coloured penguin.
900
+ A storefront with 'Text to Image' written on it.
901
+ A cat on the right of a tennis racket.
902
+ A small blue book sitting on a large red book.
903
+ Two cats and one dog sitting on the grass.
904
+ An emoji of a baby panda wearing a red hat, green gloves, red shirt, and green pants.
905
+ A brown bird and a blue bear.
906
+ A red car and a white sheep.
907
+ A pizza on the right of a suitcase.
908
+ A small blue book sitting on a large red book.
909
+ A horse riding an astronaut.
910
+ A sign that says 'Google Brain Toronto'.
911
+ Hyper-realistic photo of an abandoned industrial site during a storm.
912
+ A side view of an owl sitting in a field.
913
+ A photo of a confused grizzly bear in calculus class.
914
+ An American multinational technology company that focuses on artificial intelligence, search engine, online advertising, cloud computing, computer software, quantum computing, e-commerce, and consumer electronics.
915
+ A storefront with 'NeurIPS' written on it.
916
+ A storefront with 'NeurIPS' written on it.
917
+ Two cats and one dog sitting on the grass.
918
+ New York Skyline with 'Diffusion' written with fireworks on the sky.
919
+ A storefront with 'Diffusion' written on it.
920
+ A blue coloured pizza.
921
+ A single clock is sitting on a table.
922
+ A zebra to the right of a fire hydrant.
923
+ Backlotter.
924
+ An ancient Egyptian painting depicting an argument over whose turn it is to take out the trash.
925
+ Two cats and two dogs sitting on the grass.
926
+ Painting of Mona Lisa but the view is from behind of Mona Lisa.
927
+ A triangular orange picture frame. An orange picture frame in the shape of a triangle.
928
+ A bird scaring a scarecrow.
929
+ A keyboard made of water, the water is made of light, the light is turned off.
930
+ A tennis racket underneath a traffic light.
931
+ A banana on the left of an apple.
932
+ A screenshot of an iOS app for ordering different types of milk.
933
+ A long curved fruit which grows in clusters and has soft pulpy flesh and yellow skin when ripe.
934
+ A side view of an owl sitting in a field.
935
+ Two cats and two dogs sitting on the grass.
936
+ Hovering cow abducting aliens.
937
+ A red car and a white sheep.
938
+ A zebra underneath a broccoli.
939
+ Rainbow coloured penguin.
940
+ A storefront with 'Deep Learning' written on it.
941
+ Three cars on the street.
942
+ A red colored banana.
943
+ A blue bird and a brown bear.
944
+ New York Skyline with 'NeurIPS' written with fireworks on the sky.
945
+ A sjmall domesticated carnivorious mammnal with sof fuh,y a sthort sout, and retracwtablbe flaws. It iw widexly kept as a pet or for catchitng mic, ad many breeds zhlyde beefn develvoked.
946
+ A giraffe underneath a microwave.
947
+ A brown colored giraffe.
948
+ An instqrumemnt used for cutting cloth, paper, axdz othr thdin mteroial, consamistng of two blades lad one on tvopb of the other and fhastned in tle mixdqdjle so as to bllow them txo be pened and closed by thumb and fitngesr inserted tgrough rings on kthe end oc thei vatndlzes.
949
+ A pizza cooking an oven.
950
+ A bicycle on top of a boat.
951
+ A screenshot of an iOS app for ordering different types of milk.
952
+ A car playing soccer, digital art.
953
+ A banana on the left of an apple.
954
+ A cube made of brick. A cube with the texture of brick.
955
+ A sheep to the right of a wine glass.
956
+ A type of digital currency in which a record of transactions is maintained and new units of currency are generated by the computational solution of mathematical problems, and which operates independently of a central bank.
957
+ A medieval painting of the wifi not working.
958
+ A brown bird and a blue bear.
959
+ A yellow and black bus cruising through the rainforest.
960
+ A bridge connecting Europe and North America on the Atlantic Ocean, bird's eye view.
961
+ Hyper-realistic photo of an abandoned industrial site during a storm.
962
+ Photo of an athlete cat explaining it's latest scandal at a press conference to journalists.
963
+ A stack of 3 cubes. A red cube is on the top, sitting on a red cube. The red cube is in the middle, sitting on a green cube. The green cube is on the bottom.
964
+ A yellow book and a red vase.
965
+ A wine glass on top of a dog.
966
+ A sign that says 'Deep Learning'.
967
+ A small domesticated carnivorous mammal with soft fur, a short snout, and retractable claws. It is widely kept as a pet or for catching mice, and many breeds have been developed.
968
+ Jentacular.
969
+ A car on the left of a bus.
970
+ A machine resembling a human being and able to replicate certain human movements and functions automatically.
971
+ New York Skyline with 'Google Research Pizza Cafe' written with fireworks on the sky.
972
+ Photo of a mega Lego space station inside a kid's bedroom.
973
+ Peristeronic.
974
+ One cat and one dog sitting on the grass.
975
+ A horse riding an astronaut.
976
+ New York Skyline with 'Deep Learning' written with fireworks on the sky.
977
+ A zebra underneath a broccoli.
978
+ A machine resembling a human being and able to replicate certain human movements and functions automatically.
979
+ A red colored dog.
980
+ Acersecomicke.
981
+ One dog on the street.
982
+ A white car and a red sheep.
983
+ New York Skyline with 'NeurIPS' written with fireworks on the sky.
984
+ A single clock is sitting on a table.
985
+ A zebra to the right of a fire hydrant.
986
+ A triangular orange picture frame. An orange picture frame in the shape of a triangle.
987
+ A blue colored dog.
988
+ McDonalds Church.
989
+ Tcennis rpacket.
990
+ A brown colored giraffe.
991
+ Hyper-realistic photo of an abandoned industrial site during a storm.
992
+ Tcennis rpacket.
993
+ A church with stained glass windows depicting a hamburger and french fries.
994
+ A bicycle on top of a boat.
995
+ A banana on the left of an apple.
996
+ A connection point by which firefighters can tap into a water supply.
997
+ New York Skyline with 'Deep Learning' written with fireworks on the sky.
998
+ An instqrumemnt used for cutting cloth, paper, axdz othr thdin mteroial, consamistng of two blades lad one on tvopb of the other and fhastned in tle mixdqdjle so as to bllow them txo be pened and closed by thumb and fitngesr inserted tgrough rings on kthe end oc thei vatndlzes.
999
+ New York Skyline with 'NeurIPS' written with fireworks on the sky.
1000
+ Paying for a quarter-sized pizza with a pizza-sized quarter.
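The list above is stored one prompt per line; the interleaved numbers are the diff view's line counters, not part of the file. A minimal sketch of iterating such a list follows. The file name is a placeholder (the actual prompt file is named where this hunk begins, earlier in the diff), and the print call merely stands in for the repository's text-to-image inference entry point, which this sketch does not assume.

# Sketch only (not part of this commit): walk a line-per-prompt evaluation list in order.
PROMPT_FILE = "prompts/eval_prompts.txt"  # placeholder path, not taken from this commit

with open(PROMPT_FILE, encoding="utf-8") as f:
    prompts = [line.strip() for line in f if line.strip()]

for idx, prompt in enumerate(prompts, start=1):
    # Replace this print with the actual generation call used by the inference code.
    print(f"{idx:04d}: {prompt}")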
prompts/evaluation_metadata.jsonl ADDED
@@ -0,0 +1,553 @@
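The file below is evaluation metadata in a GenEval-style schema: one JSON object per line with a task "tag" (single_object, two_object, counting, colors, position in the entries shown), the "prompt" to generate, an "include" list of required object classes and counts, an optional "exclude" list of counts that must not be reached, and optional per-object "color" or "position" fields, where a value like ["right of", 0] pairs a spatial relation with the index of the reference object inside the same "include" list. A minimal reading sketch, assuming only the path added by this commit (it is not part of the repository's inference code):

import json
from collections import Counter

META_PATH = "prompts/evaluation_metadata.jsonl"  # path as added in this commit

# Parse one JSON object per non-empty line.
with open(META_PATH, encoding="utf-8") as f:
    entries = [json.loads(line) for line in f if line.strip()]

# Summarise how many prompts each task tag contributes.
print(Counter(entry["tag"] for entry in entries))

# Resolve the spatial relation of the first "position" entry:
# the second element of a "position" value indexes into the entry's "include" list.
for entry in entries:
    if entry["tag"] == "position":
        placed = next(obj for obj in entry["include"] if "position" in obj)
        relation, ref_idx = placed["position"]
        reference = entry["include"][ref_idx]["class"]
        print(f'{entry["prompt"]!r}: {placed["class"]} {relation} {reference}')
        break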
1
+ {"tag": "single_object", "include": [{"class": "bench", "count": 1}], "prompt": "a photo of a bench"}
2
+ {"tag": "single_object", "include": [{"class": "cow", "count": 1}], "prompt": "a photo of a cow"}
3
+ {"tag": "single_object", "include": [{"class": "bicycle", "count": 1}], "prompt": "a photo of a bicycle"}
4
+ {"tag": "single_object", "include": [{"class": "clock", "count": 1}], "prompt": "a photo of a clock"}
5
+ {"tag": "single_object", "include": [{"class": "carrot", "count": 1}], "prompt": "a photo of a carrot"}
6
+ {"tag": "single_object", "include": [{"class": "suitcase", "count": 1}], "prompt": "a photo of a suitcase"}
7
+ {"tag": "single_object", "include": [{"class": "fork", "count": 1}], "prompt": "a photo of a fork"}
8
+ {"tag": "single_object", "include": [{"class": "surfboard", "count": 1}], "prompt": "a photo of a surfboard"}
9
+ {"tag": "single_object", "include": [{"class": "refrigerator", "count": 1}], "prompt": "a photo of a refrigerator"}
10
+ {"tag": "single_object", "include": [{"class": "cup", "count": 1}], "prompt": "a photo of a cup"}
11
+ {"tag": "single_object", "include": [{"class": "microwave", "count": 1}], "prompt": "a photo of a microwave"}
12
+ {"tag": "single_object", "include": [{"class": "potted plant", "count": 1}], "prompt": "a photo of a potted plant"}
13
+ {"tag": "single_object", "include": [{"class": "snowboard", "count": 1}], "prompt": "a photo of a snowboard"}
14
+ {"tag": "single_object", "include": [{"class": "zebra", "count": 1}], "prompt": "a photo of a zebra"}
15
+ {"tag": "single_object", "include": [{"class": "parking meter", "count": 1}], "prompt": "a photo of a parking meter"}
16
+ {"tag": "single_object", "include": [{"class": "spoon", "count": 1}], "prompt": "a photo of a spoon"}
17
+ {"tag": "single_object", "include": [{"class": "skateboard", "count": 1}], "prompt": "a photo of a skateboard"}
18
+ {"tag": "single_object", "include": [{"class": "car", "count": 1}], "prompt": "a photo of a car"}
19
+ {"tag": "single_object", "include": [{"class": "motorcycle", "count": 1}], "prompt": "a photo of a motorcycle"}
20
+ {"tag": "single_object", "include": [{"class": "traffic light", "count": 1}], "prompt": "a photo of a traffic light"}
21
+ {"tag": "single_object", "include": [{"class": "book", "count": 1}], "prompt": "a photo of a book"}
22
+ {"tag": "single_object", "include": [{"class": "couch", "count": 1}], "prompt": "a photo of a couch"}
23
+ {"tag": "single_object", "include": [{"class": "backpack", "count": 1}], "prompt": "a photo of a backpack"}
24
+ {"tag": "single_object", "include": [{"class": "computer keyboard", "count": 1}], "prompt": "a photo of a computer keyboard"}
25
+ {"tag": "single_object", "include": [{"class": "toaster", "count": 1}], "prompt": "a photo of a toaster"}
26
+ {"tag": "single_object", "include": [{"class": "bird", "count": 1}], "prompt": "a photo of a bird"}
27
+ {"tag": "single_object", "include": [{"class": "bowl", "count": 1}], "prompt": "a photo of a bowl"}
28
+ {"tag": "single_object", "include": [{"class": "dog", "count": 1}], "prompt": "a photo of a dog"}
29
+ {"tag": "single_object", "include": [{"class": "tie", "count": 1}], "prompt": "a photo of a tie"}
30
+ {"tag": "single_object", "include": [{"class": "laptop", "count": 1}], "prompt": "a photo of a laptop"}
31
+ {"tag": "single_object", "include": [{"class": "computer mouse", "count": 1}], "prompt": "a photo of a computer mouse"}
32
+ {"tag": "single_object", "include": [{"class": "sandwich", "count": 1}], "prompt": "a photo of a sandwich"}
33
+ {"tag": "single_object", "include": [{"class": "baseball bat", "count": 1}], "prompt": "a photo of a baseball bat"}
34
+ {"tag": "single_object", "include": [{"class": "train", "count": 1}], "prompt": "a photo of a train"}
35
+ {"tag": "single_object", "include": [{"class": "cell phone", "count": 1}], "prompt": "a photo of a cell phone"}
36
+ {"tag": "single_object", "include": [{"class": "chair", "count": 1}], "prompt": "a photo of a chair"}
37
+ {"tag": "single_object", "include": [{"class": "tv", "count": 1}], "prompt": "a photo of a tv"}
38
+ {"tag": "single_object", "include": [{"class": "broccoli", "count": 1}], "prompt": "a photo of a broccoli"}
39
+ {"tag": "single_object", "include": [{"class": "bed", "count": 1}], "prompt": "a photo of a bed"}
40
+ {"tag": "single_object", "include": [{"class": "skis", "count": 1}], "prompt": "a photo of a skis"}
41
+ {"tag": "single_object", "include": [{"class": "handbag", "count": 1}], "prompt": "a photo of a handbag"}
42
+ {"tag": "single_object", "include": [{"class": "pizza", "count": 1}], "prompt": "a photo of a pizza"}
43
+ {"tag": "single_object", "include": [{"class": "frisbee", "count": 1}], "prompt": "a photo of a frisbee"}
44
+ {"tag": "single_object", "include": [{"class": "scissors", "count": 1}], "prompt": "a photo of a scissors"}
45
+ {"tag": "single_object", "include": [{"class": "bottle", "count": 1}], "prompt": "a photo of a bottle"}
46
+ {"tag": "single_object", "include": [{"class": "elephant", "count": 1}], "prompt": "a photo of an elephant"}
47
+ {"tag": "single_object", "include": [{"class": "toilet", "count": 1}], "prompt": "a photo of a toilet"}
48
+ {"tag": "single_object", "include": [{"class": "oven", "count": 1}], "prompt": "a photo of an oven"}
49
+ {"tag": "single_object", "include": [{"class": "orange", "count": 1}], "prompt": "a photo of an orange"}
50
+ {"tag": "single_object", "include": [{"class": "person", "count": 1}], "prompt": "a photo of a person"}
51
+ {"tag": "single_object", "include": [{"class": "teddy bear", "count": 1}], "prompt": "a photo of a teddy bear"}
52
+ {"tag": "single_object", "include": [{"class": "vase", "count": 1}], "prompt": "a photo of a vase"}
53
+ {"tag": "single_object", "include": [{"class": "banana", "count": 1}], "prompt": "a photo of a banana"}
54
+ {"tag": "single_object", "include": [{"class": "toothbrush", "count": 1}], "prompt": "a photo of a toothbrush"}
55
+ {"tag": "single_object", "include": [{"class": "tv remote", "count": 1}], "prompt": "a photo of a tv remote"}
56
+ {"tag": "single_object", "include": [{"class": "dining table", "count": 1}], "prompt": "a photo of a dining table"}
57
+ {"tag": "single_object", "include": [{"class": "stop sign", "count": 1}], "prompt": "a photo of a stop sign"}
58
+ {"tag": "single_object", "include": [{"class": "sheep", "count": 1}], "prompt": "a photo of a sheep"}
59
+ {"tag": "single_object", "include": [{"class": "fire hydrant", "count": 1}], "prompt": "a photo of a fire hydrant"}
60
+ {"tag": "single_object", "include": [{"class": "airplane", "count": 1}], "prompt": "a photo of an airplane"}
61
+ {"tag": "single_object", "include": [{"class": "giraffe", "count": 1}], "prompt": "a photo of a giraffe"}
62
+ {"tag": "single_object", "include": [{"class": "horse", "count": 1}], "prompt": "a photo of a horse"}
63
+ {"tag": "single_object", "include": [{"class": "cat", "count": 1}], "prompt": "a photo of a cat"}
64
+ {"tag": "single_object", "include": [{"class": "donut", "count": 1}], "prompt": "a photo of a donut"}
65
+ {"tag": "single_object", "include": [{"class": "boat", "count": 1}], "prompt": "a photo of a boat"}
66
+ {"tag": "single_object", "include": [{"class": "baseball glove", "count": 1}], "prompt": "a photo of a baseball glove"}
67
+ {"tag": "single_object", "include": [{"class": "hair drier", "count": 1}], "prompt": "a photo of a hair drier"}
68
+ {"tag": "single_object", "include": [{"class": "sink", "count": 1}], "prompt": "a photo of a sink"}
69
+ {"tag": "single_object", "include": [{"class": "cake", "count": 1}], "prompt": "a photo of a cake"}
70
+ {"tag": "single_object", "include": [{"class": "wine glass", "count": 1}], "prompt": "a photo of a wine glass"}
71
+ {"tag": "single_object", "include": [{"class": "apple", "count": 1}], "prompt": "a photo of an apple"}
72
+ {"tag": "single_object", "include": [{"class": "bus", "count": 1}], "prompt": "a photo of a bus"}
73
+ {"tag": "single_object", "include": [{"class": "tennis racket", "count": 1}], "prompt": "a photo of a tennis racket"}
74
+ {"tag": "single_object", "include": [{"class": "knife", "count": 1}], "prompt": "a photo of a knife"}
75
+ {"tag": "single_object", "include": [{"class": "hot dog", "count": 1}], "prompt": "a photo of a hot dog"}
76
+ {"tag": "single_object", "include": [{"class": "truck", "count": 1}], "prompt": "a photo of a truck"}
77
+ {"tag": "single_object", "include": [{"class": "umbrella", "count": 1}], "prompt": "a photo of an umbrella"}
78
+ {"tag": "single_object", "include": [{"class": "sports ball", "count": 1}], "prompt": "a photo of a sports ball"}
79
+ {"tag": "single_object", "include": [{"class": "bear", "count": 1}], "prompt": "a photo of a bear"}
80
+ {"tag": "single_object", "include": [{"class": "kite", "count": 1}], "prompt": "a photo of a kite"}
81
+ {"tag": "two_object", "include": [{"class": "bench", "count": 1}, {"class": "sports ball", "count": 1}], "prompt": "a photo of a bench and a sports ball"}
82
+ {"tag": "two_object", "include": [{"class": "toothbrush", "count": 1}, {"class": "snowboard", "count": 1}], "prompt": "a photo of a toothbrush and a snowboard"}
83
+ {"tag": "two_object", "include": [{"class": "toaster", "count": 1}, {"class": "oven", "count": 1}], "prompt": "a photo of a toaster and an oven"}
84
+ {"tag": "two_object", "include": [{"class": "broccoli", "count": 1}, {"class": "vase", "count": 1}], "prompt": "a photo of a broccoli and a vase"}
85
+ {"tag": "two_object", "include": [{"class": "tennis racket", "count": 1}, {"class": "wine glass", "count": 1}], "prompt": "a photo of a tennis racket and a wine glass"}
86
+ {"tag": "two_object", "include": [{"class": "fork", "count": 1}, {"class": "knife", "count": 1}], "prompt": "a photo of a fork and a knife"}
87
+ {"tag": "two_object", "include": [{"class": "hair drier", "count": 1}, {"class": "cake", "count": 1}], "prompt": "a photo of a hair drier and a cake"}
88
+ {"tag": "two_object", "include": [{"class": "horse", "count": 1}, {"class": "giraffe", "count": 1}], "prompt": "a photo of a horse and a giraffe"}
89
+ {"tag": "two_object", "include": [{"class": "horse", "count": 1}, {"class": "computer keyboard", "count": 1}], "prompt": "a photo of a horse and a computer keyboard"}
90
+ {"tag": "two_object", "include": [{"class": "toothbrush", "count": 1}, {"class": "carrot", "count": 1}], "prompt": "a photo of a toothbrush and a carrot"}
91
+ {"tag": "two_object", "include": [{"class": "cake", "count": 1}, {"class": "zebra", "count": 1}], "prompt": "a photo of a cake and a zebra"}
92
+ {"tag": "two_object", "include": [{"class": "hair drier", "count": 1}, {"class": "bear", "count": 1}], "prompt": "a photo of a hair drier and a bear"}
93
+ {"tag": "two_object", "include": [{"class": "knife", "count": 1}, {"class": "zebra", "count": 1}], "prompt": "a photo of a knife and a zebra"}
94
+ {"tag": "two_object", "include": [{"class": "couch", "count": 1}, {"class": "wine glass", "count": 1}], "prompt": "a photo of a couch and a wine glass"}
95
+ {"tag": "two_object", "include": [{"class": "frisbee", "count": 1}, {"class": "vase", "count": 1}], "prompt": "a photo of a frisbee and a vase"}
96
+ {"tag": "two_object", "include": [{"class": "book", "count": 1}, {"class": "laptop", "count": 1}], "prompt": "a photo of a book and a laptop"}
97
+ {"tag": "two_object", "include": [{"class": "dining table", "count": 1}, {"class": "bear", "count": 1}], "prompt": "a photo of a dining table and a bear"}
98
+ {"tag": "two_object", "include": [{"class": "frisbee", "count": 1}, {"class": "couch", "count": 1}], "prompt": "a photo of a frisbee and a couch"}
99
+ {"tag": "two_object", "include": [{"class": "couch", "count": 1}, {"class": "horse", "count": 1}], "prompt": "a photo of a couch and a horse"}
100
+ {"tag": "two_object", "include": [{"class": "toilet", "count": 1}, {"class": "computer mouse", "count": 1}], "prompt": "a photo of a toilet and a computer mouse"}
101
+ {"tag": "two_object", "include": [{"class": "bottle", "count": 1}, {"class": "refrigerator", "count": 1}], "prompt": "a photo of a bottle and a refrigerator"}
102
+ {"tag": "two_object", "include": [{"class": "potted plant", "count": 1}, {"class": "backpack", "count": 1}], "prompt": "a photo of a potted plant and a backpack"}
103
+ {"tag": "two_object", "include": [{"class": "skateboard", "count": 1}, {"class": "cake", "count": 1}], "prompt": "a photo of a skateboard and a cake"}
104
+ {"tag": "two_object", "include": [{"class": "broccoli", "count": 1}, {"class": "parking meter", "count": 1}], "prompt": "a photo of a broccoli and a parking meter"}
105
+ {"tag": "two_object", "include": [{"class": "zebra", "count": 1}, {"class": "bed", "count": 1}], "prompt": "a photo of a zebra and a bed"}
106
+ {"tag": "two_object", "include": [{"class": "oven", "count": 1}, {"class": "bed", "count": 1}], "prompt": "a photo of an oven and a bed"}
107
+ {"tag": "two_object", "include": [{"class": "baseball bat", "count": 1}, {"class": "fork", "count": 1}], "prompt": "a photo of a baseball bat and a fork"}
108
+ {"tag": "two_object", "include": [{"class": "vase", "count": 1}, {"class": "spoon", "count": 1}], "prompt": "a photo of a vase and a spoon"}
109
+ {"tag": "two_object", "include": [{"class": "skateboard", "count": 1}, {"class": "sink", "count": 1}], "prompt": "a photo of a skateboard and a sink"}
110
+ {"tag": "two_object", "include": [{"class": "pizza", "count": 1}, {"class": "bench", "count": 1}], "prompt": "a photo of a pizza and a bench"}
111
+ {"tag": "two_object", "include": [{"class": "bowl", "count": 1}, {"class": "pizza", "count": 1}], "prompt": "a photo of a bowl and a pizza"}
112
+ {"tag": "two_object", "include": [{"class": "tennis racket", "count": 1}, {"class": "bird", "count": 1}], "prompt": "a photo of a tennis racket and a bird"}
113
+ {"tag": "two_object", "include": [{"class": "wine glass", "count": 1}, {"class": "bear", "count": 1}], "prompt": "a photo of a wine glass and a bear"}
114
+ {"tag": "two_object", "include": [{"class": "fork", "count": 1}, {"class": "book", "count": 1}], "prompt": "a photo of a fork and a book"}
115
+ {"tag": "two_object", "include": [{"class": "scissors", "count": 1}, {"class": "bowl", "count": 1}], "prompt": "a photo of a scissors and a bowl"}
116
+ {"tag": "two_object", "include": [{"class": "laptop", "count": 1}, {"class": "carrot", "count": 1}], "prompt": "a photo of a laptop and a carrot"}
117
+ {"tag": "two_object", "include": [{"class": "stop sign", "count": 1}, {"class": "bottle", "count": 1}], "prompt": "a photo of a stop sign and a bottle"}
118
+ {"tag": "two_object", "include": [{"class": "microwave", "count": 1}, {"class": "truck", "count": 1}], "prompt": "a photo of a microwave and a truck"}
119
+ {"tag": "two_object", "include": [{"class": "person", "count": 1}, {"class": "bear", "count": 1}], "prompt": "a photo of a person and a bear"}
120
+ {"tag": "two_object", "include": [{"class": "frisbee", "count": 1}, {"class": "cell phone", "count": 1}], "prompt": "a photo of a frisbee and a cell phone"}
121
+ {"tag": "two_object", "include": [{"class": "parking meter", "count": 1}, {"class": "teddy bear", "count": 1}], "prompt": "a photo of a parking meter and a teddy bear"}
122
+ {"tag": "two_object", "include": [{"class": "tennis racket", "count": 1}, {"class": "bicycle", "count": 1}], "prompt": "a photo of a tennis racket and a bicycle"}
123
+ {"tag": "two_object", "include": [{"class": "stop sign", "count": 1}, {"class": "motorcycle", "count": 1}], "prompt": "a photo of a stop sign and a motorcycle"}
124
+ {"tag": "two_object", "include": [{"class": "fire hydrant", "count": 1}, {"class": "tennis racket", "count": 1}], "prompt": "a photo of a fire hydrant and a tennis racket"}
125
+ {"tag": "two_object", "include": [{"class": "scissors", "count": 1}, {"class": "sandwich", "count": 1}], "prompt": "a photo of a scissors and a sandwich"}
126
+ {"tag": "two_object", "include": [{"class": "pizza", "count": 1}, {"class": "book", "count": 1}], "prompt": "a photo of a pizza and a book"}
127
+ {"tag": "two_object", "include": [{"class": "giraffe", "count": 1}, {"class": "computer mouse", "count": 1}], "prompt": "a photo of a giraffe and a computer mouse"}
128
+ {"tag": "two_object", "include": [{"class": "stop sign", "count": 1}, {"class": "toaster", "count": 1}], "prompt": "a photo of a stop sign and a toaster"}
129
+ {"tag": "two_object", "include": [{"class": "computer mouse", "count": 1}, {"class": "zebra", "count": 1}], "prompt": "a photo of a computer mouse and a zebra"}
130
+ {"tag": "two_object", "include": [{"class": "chair", "count": 1}, {"class": "bench", "count": 1}], "prompt": "a photo of a chair and a bench"}
131
+ {"tag": "two_object", "include": [{"class": "tv", "count": 1}, {"class": "carrot", "count": 1}], "prompt": "a photo of a tv and a carrot"}
132
+ {"tag": "two_object", "include": [{"class": "surfboard", "count": 1}, {"class": "suitcase", "count": 1}], "prompt": "a photo of a surfboard and a suitcase"}
133
+ {"tag": "two_object", "include": [{"class": "computer keyboard", "count": 1}, {"class": "laptop", "count": 1}], "prompt": "a photo of a computer keyboard and a laptop"}
134
+ {"tag": "two_object", "include": [{"class": "computer keyboard", "count": 1}, {"class": "microwave", "count": 1}], "prompt": "a photo of a computer keyboard and a microwave"}
135
+ {"tag": "two_object", "include": [{"class": "scissors", "count": 1}, {"class": "bird", "count": 1}], "prompt": "a photo of a scissors and a bird"}
136
+ {"tag": "two_object", "include": [{"class": "person", "count": 1}, {"class": "snowboard", "count": 1}], "prompt": "a photo of a person and a snowboard"}
137
+ {"tag": "two_object", "include": [{"class": "cow", "count": 1}, {"class": "horse", "count": 1}], "prompt": "a photo of a cow and a horse"}
138
+ {"tag": "two_object", "include": [{"class": "handbag", "count": 1}, {"class": "refrigerator", "count": 1}], "prompt": "a photo of a handbag and a refrigerator"}
139
+ {"tag": "two_object", "include": [{"class": "chair", "count": 1}, {"class": "laptop", "count": 1}], "prompt": "a photo of a chair and a laptop"}
140
+ {"tag": "two_object", "include": [{"class": "toothbrush", "count": 1}, {"class": "bench", "count": 1}], "prompt": "a photo of a toothbrush and a bench"}
141
+ {"tag": "two_object", "include": [{"class": "book", "count": 1}, {"class": "baseball bat", "count": 1}], "prompt": "a photo of a book and a baseball bat"}
142
+ {"tag": "two_object", "include": [{"class": "horse", "count": 1}, {"class": "train", "count": 1}], "prompt": "a photo of a horse and a train"}
143
+ {"tag": "two_object", "include": [{"class": "bench", "count": 1}, {"class": "vase", "count": 1}], "prompt": "a photo of a bench and a vase"}
144
+ {"tag": "two_object", "include": [{"class": "traffic light", "count": 1}, {"class": "backpack", "count": 1}], "prompt": "a photo of a traffic light and a backpack"}
145
+ {"tag": "two_object", "include": [{"class": "sports ball", "count": 1}, {"class": "cow", "count": 1}], "prompt": "a photo of a sports ball and a cow"}
146
+ {"tag": "two_object", "include": [{"class": "computer mouse", "count": 1}, {"class": "spoon", "count": 1}], "prompt": "a photo of a computer mouse and a spoon"}
147
+ {"tag": "two_object", "include": [{"class": "tv", "count": 1}, {"class": "bicycle", "count": 1}], "prompt": "a photo of a tv and a bicycle"}
148
+ {"tag": "two_object", "include": [{"class": "bench", "count": 1}, {"class": "snowboard", "count": 1}], "prompt": "a photo of a bench and a snowboard"}
149
+ {"tag": "two_object", "include": [{"class": "toothbrush", "count": 1}, {"class": "toilet", "count": 1}], "prompt": "a photo of a toothbrush and a toilet"}
150
+ {"tag": "two_object", "include": [{"class": "person", "count": 1}, {"class": "apple", "count": 1}], "prompt": "a photo of a person and an apple"}
151
+ {"tag": "two_object", "include": [{"class": "sink", "count": 1}, {"class": "sports ball", "count": 1}], "prompt": "a photo of a sink and a sports ball"}
152
+ {"tag": "two_object", "include": [{"class": "stop sign", "count": 1}, {"class": "dog", "count": 1}], "prompt": "a photo of a stop sign and a dog"}
153
+ {"tag": "two_object", "include": [{"class": "knife", "count": 1}, {"class": "stop sign", "count": 1}], "prompt": "a photo of a knife and a stop sign"}
154
+ {"tag": "two_object", "include": [{"class": "wine glass", "count": 1}, {"class": "handbag", "count": 1}], "prompt": "a photo of a wine glass and a handbag"}
155
+ {"tag": "two_object", "include": [{"class": "bowl", "count": 1}, {"class": "skis", "count": 1}], "prompt": "a photo of a bowl and a skis"}
156
+ {"tag": "two_object", "include": [{"class": "frisbee", "count": 1}, {"class": "apple", "count": 1}], "prompt": "a photo of a frisbee and an apple"}
157
+ {"tag": "two_object", "include": [{"class": "computer keyboard", "count": 1}, {"class": "cell phone", "count": 1}], "prompt": "a photo of a computer keyboard and a cell phone"}
158
+ {"tag": "two_object", "include": [{"class": "stop sign", "count": 1}, {"class": "fork", "count": 1}], "prompt": "a photo of a stop sign and a fork"}
159
+ {"tag": "two_object", "include": [{"class": "potted plant", "count": 1}, {"class": "boat", "count": 1}], "prompt": "a photo of a potted plant and a boat"}
160
+ {"tag": "two_object", "include": [{"class": "tv", "count": 1}, {"class": "cell phone", "count": 1}], "prompt": "a photo of a tv and a cell phone"}
161
+ {"tag": "two_object", "include": [{"class": "tie", "count": 1}, {"class": "broccoli", "count": 1}], "prompt": "a photo of a tie and a broccoli"}
162
+ {"tag": "two_object", "include": [{"class": "potted plant", "count": 1}, {"class": "donut", "count": 1}], "prompt": "a photo of a potted plant and a donut"}
163
+ {"tag": "two_object", "include": [{"class": "person", "count": 1}, {"class": "sink", "count": 1}], "prompt": "a photo of a person and a sink"}
164
+ {"tag": "two_object", "include": [{"class": "couch", "count": 1}, {"class": "snowboard", "count": 1}], "prompt": "a photo of a couch and a snowboard"}
165
+ {"tag": "two_object", "include": [{"class": "fork", "count": 1}, {"class": "baseball glove", "count": 1}], "prompt": "a photo of a fork and a baseball glove"}
166
+ {"tag": "two_object", "include": [{"class": "apple", "count": 1}, {"class": "toothbrush", "count": 1}], "prompt": "a photo of an apple and a toothbrush"}
167
+ {"tag": "two_object", "include": [{"class": "bus", "count": 1}, {"class": "baseball glove", "count": 1}], "prompt": "a photo of a bus and a baseball glove"}
168
+ {"tag": "two_object", "include": [{"class": "person", "count": 1}, {"class": "stop sign", "count": 1}], "prompt": "a photo of a person and a stop sign"}
169
+ {"tag": "two_object", "include": [{"class": "carrot", "count": 1}, {"class": "couch", "count": 1}], "prompt": "a photo of a carrot and a couch"}
170
+ {"tag": "two_object", "include": [{"class": "baseball bat", "count": 1}, {"class": "bear", "count": 1}], "prompt": "a photo of a baseball bat and a bear"}
171
+ {"tag": "two_object", "include": [{"class": "fire hydrant", "count": 1}, {"class": "train", "count": 1}], "prompt": "a photo of a fire hydrant and a train"}
172
+ {"tag": "two_object", "include": [{"class": "baseball glove", "count": 1}, {"class": "carrot", "count": 1}], "prompt": "a photo of a baseball glove and a carrot"}
173
+ {"tag": "two_object", "include": [{"class": "microwave", "count": 1}, {"class": "bench", "count": 1}], "prompt": "a photo of a microwave and a bench"}
174
+ {"tag": "two_object", "include": [{"class": "cake", "count": 1}, {"class": "stop sign", "count": 1}], "prompt": "a photo of a cake and a stop sign"}
175
+ {"tag": "two_object", "include": [{"class": "car", "count": 1}, {"class": "computer mouse", "count": 1}], "prompt": "a photo of a car and a computer mouse"}
176
+ {"tag": "two_object", "include": [{"class": "suitcase", "count": 1}, {"class": "dining table", "count": 1}], "prompt": "a photo of a suitcase and a dining table"}
177
+ {"tag": "two_object", "include": [{"class": "person", "count": 1}, {"class": "traffic light", "count": 1}], "prompt": "a photo of a person and a traffic light"}
178
+ {"tag": "two_object", "include": [{"class": "cell phone", "count": 1}, {"class": "horse", "count": 1}], "prompt": "a photo of a cell phone and a horse"}
179
+ {"tag": "two_object", "include": [{"class": "baseball bat", "count": 1}, {"class": "giraffe", "count": 1}], "prompt": "a photo of a baseball bat and a giraffe"}
180
+ {"tag": "counting", "include": [{"class": "clock", "count": 2}], "exclude": [{"class": "clock", "count": 3}], "prompt": "a photo of two clocks"}
181
+ {"tag": "counting", "include": [{"class": "backpack", "count": 2}], "exclude": [{"class": "backpack", "count": 3}], "prompt": "a photo of two backpacks"}
182
+ {"tag": "counting", "include": [{"class": "handbag", "count": 4}], "exclude": [{"class": "handbag", "count": 5}], "prompt": "a photo of four handbags"}
183
+ {"tag": "counting", "include": [{"class": "frisbee", "count": 2}], "exclude": [{"class": "frisbee", "count": 3}], "prompt": "a photo of two frisbees"}
184
+ {"tag": "counting", "include": [{"class": "sports ball", "count": 3}], "exclude": [{"class": "sports ball", "count": 4}], "prompt": "a photo of three sports balls"}
185
+ {"tag": "counting", "include": [{"class": "bear", "count": 2}], "exclude": [{"class": "bear", "count": 3}], "prompt": "a photo of two bears"}
186
+ {"tag": "counting", "include": [{"class": "tie", "count": 2}], "exclude": [{"class": "tie", "count": 3}], "prompt": "a photo of two ties"}
187
+ {"tag": "counting", "include": [{"class": "sink", "count": 4}], "exclude": [{"class": "sink", "count": 5}], "prompt": "a photo of four sinks"}
188
+ {"tag": "counting", "include": [{"class": "toothbrush", "count": 2}], "exclude": [{"class": "toothbrush", "count": 3}], "prompt": "a photo of two toothbrushs"}
189
+ {"tag": "counting", "include": [{"class": "person", "count": 3}], "exclude": [{"class": "person", "count": 4}], "prompt": "a photo of three persons"}
190
+ {"tag": "counting", "include": [{"class": "tennis racket", "count": 3}], "exclude": [{"class": "tennis racket", "count": 4}], "prompt": "a photo of three tennis rackets"}
191
+ {"tag": "counting", "include": [{"class": "bowl", "count": 4}], "exclude": [{"class": "bowl", "count": 5}], "prompt": "a photo of four bowls"}
192
+ {"tag": "counting", "include": [{"class": "vase", "count": 4}], "exclude": [{"class": "vase", "count": 5}], "prompt": "a photo of four vases"}
193
+ {"tag": "counting", "include": [{"class": "cup", "count": 3}], "exclude": [{"class": "cup", "count": 4}], "prompt": "a photo of three cups"}
194
+ {"tag": "counting", "include": [{"class": "computer keyboard", "count": 4}], "exclude": [{"class": "computer keyboard", "count": 5}], "prompt": "a photo of four computer keyboards"}
195
+ {"tag": "counting", "include": [{"class": "sink", "count": 3}], "exclude": [{"class": "sink", "count": 4}], "prompt": "a photo of three sinks"}
196
+ {"tag": "counting", "include": [{"class": "oven", "count": 2}], "exclude": [{"class": "oven", "count": 3}], "prompt": "a photo of two ovens"}
197
+ {"tag": "counting", "include": [{"class": "toilet", "count": 2}], "exclude": [{"class": "toilet", "count": 3}], "prompt": "a photo of two toilets"}
198
+ {"tag": "counting", "include": [{"class": "bicycle", "count": 2}], "exclude": [{"class": "bicycle", "count": 3}], "prompt": "a photo of two bicycles"}
199
+ {"tag": "counting", "include": [{"class": "train", "count": 2}], "exclude": [{"class": "train", "count": 3}], "prompt": "a photo of two trains"}
200
+ {"tag": "counting", "include": [{"class": "orange", "count": 3}], "exclude": [{"class": "orange", "count": 4}], "prompt": "a photo of three oranges"}
201
+ {"tag": "counting", "include": [{"class": "bus", "count": 3}], "exclude": [{"class": "bus", "count": 4}], "prompt": "a photo of three buses"}
202
+ {"tag": "counting", "include": [{"class": "handbag", "count": 3}], "exclude": [{"class": "handbag", "count": 4}], "prompt": "a photo of three handbags"}
203
+ {"tag": "counting", "include": [{"class": "snowboard", "count": 3}], "exclude": [{"class": "snowboard", "count": 4}], "prompt": "a photo of three snowboards"}
204
+ {"tag": "counting", "include": [{"class": "snowboard", "count": 2}], "exclude": [{"class": "snowboard", "count": 3}], "prompt": "a photo of two snowboards"}
205
+ {"tag": "counting", "include": [{"class": "dog", "count": 4}], "exclude": [{"class": "dog", "count": 5}], "prompt": "a photo of four dogs"}
206
+ {"tag": "counting", "include": [{"class": "apple", "count": 3}], "exclude": [{"class": "apple", "count": 4}], "prompt": "a photo of three apples"}
207
+ {"tag": "counting", "include": [{"class": "sheep", "count": 2}], "exclude": [{"class": "sheep", "count": 3}], "prompt": "a photo of two sheeps"}
208
+ {"tag": "counting", "include": [{"class": "hot dog", "count": 3}], "exclude": [{"class": "hot dog", "count": 4}], "prompt": "a photo of three hot dogs"}
209
+ {"tag": "counting", "include": [{"class": "zebra", "count": 3}], "exclude": [{"class": "zebra", "count": 4}], "prompt": "a photo of three zebras"}
210
+ {"tag": "counting", "include": [{"class": "kite", "count": 3}], "exclude": [{"class": "kite", "count": 4}], "prompt": "a photo of three kites"}
211
+ {"tag": "counting", "include": [{"class": "apple", "count": 4}], "exclude": [{"class": "apple", "count": 5}], "prompt": "a photo of four apples"}
212
+ {"tag": "counting", "include": [{"class": "cell phone", "count": 3}], "exclude": [{"class": "cell phone", "count": 4}], "prompt": "a photo of three cell phones"}
213
+ {"tag": "counting", "include": [{"class": "baseball glove", "count": 4}], "exclude": [{"class": "baseball glove", "count": 5}], "prompt": "a photo of four baseball gloves"}
214
+ {"tag": "counting", "include": [{"class": "computer keyboard", "count": 3}], "exclude": [{"class": "computer keyboard", "count": 4}], "prompt": "a photo of three computer keyboards"}
215
+ {"tag": "counting", "include": [{"class": "bed", "count": 2}], "exclude": [{"class": "bed", "count": 3}], "prompt": "a photo of two beds"}
216
+ {"tag": "counting", "include": [{"class": "tv remote", "count": 2}], "exclude": [{"class": "tv remote", "count": 3}], "prompt": "a photo of two tv remotes"}
217
+ {"tag": "counting", "include": [{"class": "fire hydrant", "count": 3}], "exclude": [{"class": "fire hydrant", "count": 4}], "prompt": "a photo of three fire hydrants"}
218
+ {"tag": "counting", "include": [{"class": "book", "count": 3}], "exclude": [{"class": "book", "count": 4}], "prompt": "a photo of three books"}
219
+ {"tag": "counting", "include": [{"class": "giraffe", "count": 4}], "exclude": [{"class": "giraffe", "count": 5}], "prompt": "a photo of four giraffes"}
220
+ {"tag": "counting", "include": [{"class": "vase", "count": 2}], "exclude": [{"class": "vase", "count": 3}], "prompt": "a photo of two vases"}
221
+ {"tag": "counting", "include": [{"class": "donut", "count": 4}], "exclude": [{"class": "donut", "count": 5}], "prompt": "a photo of four donuts"}
222
+ {"tag": "counting", "include": [{"class": "chair", "count": 4}], "exclude": [{"class": "chair", "count": 5}], "prompt": "a photo of four chairs"}
223
+ {"tag": "counting", "include": [{"class": "baseball bat", "count": 3}], "exclude": [{"class": "baseball bat", "count": 4}], "prompt": "a photo of three baseball bats"}
224
+ {"tag": "counting", "include": [{"class": "stop sign", "count": 4}], "exclude": [{"class": "stop sign", "count": 5}], "prompt": "a photo of four stop signs"}
225
+ {"tag": "counting", "include": [{"class": "pizza", "count": 2}], "exclude": [{"class": "pizza", "count": 3}], "prompt": "a photo of two pizzas"}
226
+ {"tag": "counting", "include": [{"class": "refrigerator", "count": 3}], "exclude": [{"class": "refrigerator", "count": 4}], "prompt": "a photo of three refrigerators"}
227
+ {"tag": "counting", "include": [{"class": "fire hydrant", "count": 2}], "exclude": [{"class": "fire hydrant", "count": 3}], "prompt": "a photo of two fire hydrants"}
228
+ {"tag": "counting", "include": [{"class": "giraffe", "count": 3}], "exclude": [{"class": "giraffe", "count": 4}], "prompt": "a photo of three giraffes"}
229
+ {"tag": "counting", "include": [{"class": "tv", "count": 4}], "exclude": [{"class": "tv", "count": 5}], "prompt": "a photo of four tvs"}
230
+ {"tag": "counting", "include": [{"class": "wine glass", "count": 3}], "exclude": [{"class": "wine glass", "count": 4}], "prompt": "a photo of three wine glasses"}
231
+ {"tag": "counting", "include": [{"class": "broccoli", "count": 4}], "exclude": [{"class": "broccoli", "count": 5}], "prompt": "a photo of four broccolis"}
232
+ {"tag": "counting", "include": [{"class": "truck", "count": 3}], "exclude": [{"class": "truck", "count": 4}], "prompt": "a photo of three trucks"}
233
+ {"tag": "counting", "include": [{"class": "truck", "count": 2}], "exclude": [{"class": "truck", "count": 3}], "prompt": "a photo of two trucks"}
234
+ {"tag": "counting", "include": [{"class": "carrot", "count": 2}], "exclude": [{"class": "carrot", "count": 3}], "prompt": "a photo of two carrots"}
235
+ {"tag": "counting", "include": [{"class": "sandwich", "count": 2}], "exclude": [{"class": "sandwich", "count": 3}], "prompt": "a photo of two sandwichs"}
236
+ {"tag": "counting", "include": [{"class": "traffic light", "count": 4}], "exclude": [{"class": "traffic light", "count": 5}], "prompt": "a photo of four traffic lights"}
237
+ {"tag": "counting", "include": [{"class": "clock", "count": 4}], "exclude": [{"class": "clock", "count": 5}], "prompt": "a photo of four clocks"}
238
+ {"tag": "counting", "include": [{"class": "car", "count": 2}], "exclude": [{"class": "car", "count": 3}], "prompt": "a photo of two cars"}
239
+ {"tag": "counting", "include": [{"class": "banana", "count": 2}], "exclude": [{"class": "banana", "count": 3}], "prompt": "a photo of two bananas"}
240
+ {"tag": "counting", "include": [{"class": "wine glass", "count": 2}], "exclude": [{"class": "wine glass", "count": 3}], "prompt": "a photo of two wine glasses"}
241
+ {"tag": "counting", "include": [{"class": "pizza", "count": 3}], "exclude": [{"class": "pizza", "count": 4}], "prompt": "a photo of three pizzas"}
242
+ {"tag": "counting", "include": [{"class": "knife", "count": 4}], "exclude": [{"class": "knife", "count": 5}], "prompt": "a photo of four knifes"}
243
+ {"tag": "counting", "include": [{"class": "suitcase", "count": 3}], "exclude": [{"class": "suitcase", "count": 4}], "prompt": "a photo of three suitcases"}
244
+ {"tag": "counting", "include": [{"class": "zebra", "count": 4}], "exclude": [{"class": "zebra", "count": 5}], "prompt": "a photo of four zebras"}
245
+ {"tag": "counting", "include": [{"class": "teddy bear", "count": 2}], "exclude": [{"class": "teddy bear", "count": 3}], "prompt": "a photo of two teddy bears"}
246
+ {"tag": "counting", "include": [{"class": "skateboard", "count": 4}], "exclude": [{"class": "skateboard", "count": 5}], "prompt": "a photo of four skateboards"}
247
+ {"tag": "counting", "include": [{"class": "hot dog", "count": 4}], "exclude": [{"class": "hot dog", "count": 5}], "prompt": "a photo of four hot dogs"}
248
+ {"tag": "counting", "include": [{"class": "bird", "count": 3}], "exclude": [{"class": "bird", "count": 4}], "prompt": "a photo of three birds"}
249
+ {"tag": "counting", "include": [{"class": "boat", "count": 4}], "exclude": [{"class": "boat", "count": 5}], "prompt": "a photo of four boats"}
250
+ {"tag": "counting", "include": [{"class": "microwave", "count": 4}], "exclude": [{"class": "microwave", "count": 5}], "prompt": "a photo of four microwaves"}
251
+ {"tag": "counting", "include": [{"class": "hair drier", "count": 2}], "exclude": [{"class": "hair drier", "count": 3}], "prompt": "a photo of two hair driers"}
252
+ {"tag": "counting", "include": [{"class": "laptop", "count": 3}], "exclude": [{"class": "laptop", "count": 4}], "prompt": "a photo of three laptops"}
253
+ {"tag": "counting", "include": [{"class": "cow", "count": 3}], "exclude": [{"class": "cow", "count": 4}], "prompt": "a photo of three cows"}
254
+ {"tag": "counting", "include": [{"class": "parking meter", "count": 2}], "exclude": [{"class": "parking meter", "count": 3}], "prompt": "a photo of two parking meters"}
255
+ {"tag": "counting", "include": [{"class": "bench", "count": 4}], "exclude": [{"class": "bench", "count": 5}], "prompt": "a photo of four benchs"}
256
+ {"tag": "counting", "include": [{"class": "bench", "count": 3}], "exclude": [{"class": "bench", "count": 4}], "prompt": "a photo of three benchs"}
257
+ {"tag": "counting", "include": [{"class": "frisbee", "count": 4}], "exclude": [{"class": "frisbee", "count": 5}], "prompt": "a photo of four frisbees"}
258
+ {"tag": "counting", "include": [{"class": "book", "count": 4}], "exclude": [{"class": "book", "count": 5}], "prompt": "a photo of four books"}
259
+ {"tag": "counting", "include": [{"class": "bus", "count": 4}], "exclude": [{"class": "bus", "count": 5}], "prompt": "a photo of four buses"}
260
+ {"tag": "colors", "include": [{"class": "fire hydrant", "count": 1, "color": "blue"}], "prompt": "a photo of a blue fire hydrant"}
261
+ {"tag": "colors", "include": [{"class": "car", "count": 1, "color": "pink"}], "prompt": "a photo of a pink car"}
262
+ {"tag": "colors", "include": [{"class": "cup", "count": 1, "color": "purple"}], "prompt": "a photo of a purple cup"}
263
+ {"tag": "colors", "include": [{"class": "cow", "count": 1, "color": "blue"}], "prompt": "a photo of a blue cow"}
264
+ {"tag": "colors", "include": [{"class": "boat", "count": 1, "color": "yellow"}], "prompt": "a photo of a yellow boat"}
265
+ {"tag": "colors", "include": [{"class": "umbrella", "count": 1, "color": "blue"}], "prompt": "a photo of a blue umbrella"}
266
+ {"tag": "colors", "include": [{"class": "elephant", "count": 1, "color": "blue"}], "prompt": "a photo of a blue elephant"}
267
+ {"tag": "colors", "include": [{"class": "elephant", "count": 1, "color": "yellow"}], "prompt": "a photo of a yellow elephant"}
268
+ {"tag": "colors", "include": [{"class": "bicycle", "count": 1, "color": "red"}], "prompt": "a photo of a red bicycle"}
269
+ {"tag": "colors", "include": [{"class": "suitcase", "count": 1, "color": "purple"}], "prompt": "a photo of a purple suitcase"}
270
+ {"tag": "colors", "include": [{"class": "hair drier", "count": 1, "color": "purple"}], "prompt": "a photo of a purple hair drier"}
271
+ {"tag": "colors", "include": [{"class": "sandwich", "count": 1, "color": "white"}], "prompt": "a photo of a white sandwich"}
272
+ {"tag": "colors", "include": [{"class": "elephant", "count": 1, "color": "purple"}], "prompt": "a photo of a purple elephant"}
273
+ {"tag": "colors", "include": [{"class": "microwave", "count": 1, "color": "green"}], "prompt": "a photo of a green microwave"}
274
+ {"tag": "colors", "include": [{"class": "zebra", "count": 1, "color": "red"}], "prompt": "a photo of a red zebra"}
275
+ {"tag": "colors", "include": [{"class": "apple", "count": 1, "color": "red"}], "prompt": "a photo of a red apple"}
276
+ {"tag": "colors", "include": [{"class": "tv remote", "count": 1, "color": "yellow"}], "prompt": "a photo of a yellow tv remote"}
277
+ {"tag": "colors", "include": [{"class": "toilet", "count": 1, "color": "blue"}], "prompt": "a photo of a blue toilet"}
278
+ {"tag": "colors", "include": [{"class": "orange", "count": 1, "color": "orange"}], "prompt": "a photo of an orange orange"}
279
+ {"tag": "colors", "include": [{"class": "donut", "count": 1, "color": "black"}], "prompt": "a photo of a black donut"}
280
+ {"tag": "colors", "include": [{"class": "vase", "count": 1, "color": "red"}], "prompt": "a photo of a red vase"}
281
+ {"tag": "colors", "include": [{"class": "pizza", "count": 1, "color": "purple"}], "prompt": "a photo of a purple pizza"}
282
+ {"tag": "colors", "include": [{"class": "skateboard", "count": 1, "color": "pink"}], "prompt": "a photo of a pink skateboard"}
283
+ {"tag": "colors", "include": [{"class": "skateboard", "count": 1, "color": "green"}], "prompt": "a photo of a green skateboard"}
284
+ {"tag": "colors", "include": [{"class": "bear", "count": 1, "color": "purple"}], "prompt": "a photo of a purple bear"}
285
+ {"tag": "colors", "include": [{"class": "chair", "count": 1, "color": "brown"}], "prompt": "a photo of a brown chair"}
286
+ {"tag": "colors", "include": [{"class": "computer keyboard", "count": 1, "color": "brown"}], "prompt": "a photo of a brown computer keyboard"}
287
+ {"tag": "colors", "include": [{"class": "cow", "count": 1, "color": "orange"}], "prompt": "a photo of an orange cow"}
288
+ {"tag": "colors", "include": [{"class": "skis", "count": 1, "color": "brown"}], "prompt": "a photo of a brown skis"}
289
+ {"tag": "colors", "include": [{"class": "kite", "count": 1, "color": "white"}], "prompt": "a photo of a white kite"}
290
+ {"tag": "colors", "include": [{"class": "dog", "count": 1, "color": "red"}], "prompt": "a photo of a red dog"}
291
+ {"tag": "colors", "include": [{"class": "couch", "count": 1, "color": "green"}], "prompt": "a photo of a green couch"}
292
+ {"tag": "colors", "include": [{"class": "airplane", "count": 1, "color": "yellow"}], "prompt": "a photo of a yellow airplane"}
293
+ {"tag": "colors", "include": [{"class": "tv", "count": 1, "color": "orange"}], "prompt": "a photo of an orange tv"}
294
+ {"tag": "colors", "include": [{"class": "scissors", "count": 1, "color": "white"}], "prompt": "a photo of a white scissors"}
295
+ {"tag": "colors", "include": [{"class": "cell phone", "count": 1, "color": "pink"}], "prompt": "a photo of a pink cell phone"}
296
+ {"tag": "colors", "include": [{"class": "surfboard", "count": 1, "color": "green"}], "prompt": "a photo of a green surfboard"}
297
+ {"tag": "colors", "include": [{"class": "fire hydrant", "count": 1, "color": "white"}], "prompt": "a photo of a white fire hydrant"}
298
+ {"tag": "colors", "include": [{"class": "bicycle", "count": 1, "color": "black"}], "prompt": "a photo of a black bicycle"}
299
+ {"tag": "colors", "include": [{"class": "carrot", "count": 1, "color": "purple"}], "prompt": "a photo of a purple carrot"}
300
+ {"tag": "colors", "include": [{"class": "dining table", "count": 1, "color": "black"}], "prompt": "a photo of a black dining table"}
301
+ {"tag": "colors", "include": [{"class": "potted plant", "count": 1, "color": "purple"}], "prompt": "a photo of a purple potted plant"}
302
+ {"tag": "colors", "include": [{"class": "backpack", "count": 1, "color": "purple"}], "prompt": "a photo of a purple backpack"}
303
+ {"tag": "colors", "include": [{"class": "train", "count": 1, "color": "yellow"}], "prompt": "a photo of a yellow train"}
304
+ {"tag": "colors", "include": [{"class": "potted plant", "count": 1, "color": "pink"}], "prompt": "a photo of a pink potted plant"}
305
+ {"tag": "colors", "include": [{"class": "giraffe", "count": 1, "color": "red"}], "prompt": "a photo of a red giraffe"}
306
+ {"tag": "colors", "include": [{"class": "bear", "count": 1, "color": "brown"}], "prompt": "a photo of a brown bear"}
307
+ {"tag": "colors", "include": [{"class": "train", "count": 1, "color": "black"}], "prompt": "a photo of a black train"}
308
+ {"tag": "colors", "include": [{"class": "laptop", "count": 1, "color": "orange"}], "prompt": "a photo of an orange laptop"}
309
+ {"tag": "colors", "include": [{"class": "hot dog", "count": 1, "color": "green"}], "prompt": "a photo of a green hot dog"}
310
+ {"tag": "colors", "include": [{"class": "parking meter", "count": 1, "color": "yellow"}], "prompt": "a photo of a yellow parking meter"}
311
+ {"tag": "colors", "include": [{"class": "potted plant", "count": 1, "color": "red"}], "prompt": "a photo of a red potted plant"}
312
+ {"tag": "colors", "include": [{"class": "traffic light", "count": 1, "color": "green"}], "prompt": "a photo of a green traffic light"}
313
+ {"tag": "colors", "include": [{"class": "tv", "count": 1, "color": "blue"}], "prompt": "a photo of a blue tv"}
314
+ {"tag": "colors", "include": [{"class": "refrigerator", "count": 1, "color": "brown"}], "prompt": "a photo of a brown refrigerator"}
315
+ {"tag": "colors", "include": [{"class": "tv remote", "count": 1, "color": "black"}], "prompt": "a photo of a black tv remote"}
316
+ {"tag": "colors", "include": [{"class": "scissors", "count": 1, "color": "purple"}], "prompt": "a photo of a purple scissors"}
317
+ {"tag": "colors", "include": [{"class": "orange", "count": 1, "color": "yellow"}], "prompt": "a photo of a yellow orange"}
318
+ {"tag": "colors", "include": [{"class": "toaster", "count": 1, "color": "brown"}], "prompt": "a photo of a brown toaster"}
319
+ {"tag": "colors", "include": [{"class": "parking meter", "count": 1, "color": "red"}], "prompt": "a photo of a red parking meter"}
320
+ {"tag": "colors", "include": [{"class": "orange", "count": 1, "color": "brown"}], "prompt": "a photo of a brown orange"}
321
+ {"tag": "colors", "include": [{"class": "clock", "count": 1, "color": "green"}], "prompt": "a photo of a green clock"}
322
+ {"tag": "colors", "include": [{"class": "sheep", "count": 1, "color": "white"}], "prompt": "a photo of a white sheep"}
323
+ {"tag": "colors", "include": [{"class": "oven", "count": 1, "color": "yellow"}], "prompt": "a photo of a yellow oven"}
324
+ {"tag": "colors", "include": [{"class": "vase", "count": 1, "color": "green"}], "prompt": "a photo of a green vase"}
325
+ {"tag": "colors", "include": [{"class": "teddy bear", "count": 1, "color": "black"}], "prompt": "a photo of a black teddy bear"}
326
+ {"tag": "colors", "include": [{"class": "carrot", "count": 1, "color": "yellow"}], "prompt": "a photo of a yellow carrot"}
327
+ {"tag": "colors", "include": [{"class": "hot dog", "count": 1, "color": "black"}], "prompt": "a photo of a black hot dog"}
328
+ {"tag": "colors", "include": [{"class": "scissors", "count": 1, "color": "red"}], "prompt": "a photo of a red scissors"}
329
+ {"tag": "colors", "include": [{"class": "teddy bear", "count": 1, "color": "white"}], "prompt": "a photo of a white teddy bear"}
330
+ {"tag": "colors", "include": [{"class": "skis", "count": 1, "color": "black"}], "prompt": "a photo of a black skis"}
331
+ {"tag": "colors", "include": [{"class": "dining table", "count": 1, "color": "blue"}], "prompt": "a photo of a blue dining table"}
332
+ {"tag": "colors", "include": [{"class": "refrigerator", "count": 1, "color": "black"}], "prompt": "a photo of a black refrigerator"}
333
+ {"tag": "colors", "include": [{"class": "dog", "count": 1, "color": "white"}], "prompt": "a photo of a white dog"}
334
+ {"tag": "colors", "include": [{"class": "scissors", "count": 1, "color": "orange"}], "prompt": "a photo of an orange scissors"}
335
+ {"tag": "colors", "include": [{"class": "cell phone", "count": 1, "color": "red"}], "prompt": "a photo of a red cell phone"}
336
+ {"tag": "colors", "include": [{"class": "orange", "count": 1, "color": "white"}], "prompt": "a photo of a white orange"}
337
+ {"tag": "colors", "include": [{"class": "clock", "count": 1, "color": "blue"}], "prompt": "a photo of a blue clock"}
338
+ {"tag": "colors", "include": [{"class": "carrot", "count": 1, "color": "blue"}], "prompt": "a photo of a blue carrot"}
339
+ {"tag": "colors", "include": [{"class": "motorcycle", "count": 1, "color": "green"}], "prompt": "a photo of a green motorcycle"}
340
+ {"tag": "colors", "include": [{"class": "stop sign", "count": 1, "color": "pink"}], "prompt": "a photo of a pink stop sign"}
341
+ {"tag": "colors", "include": [{"class": "vase", "count": 1, "color": "black"}], "prompt": "a photo of a black vase"}
342
+ {"tag": "colors", "include": [{"class": "backpack", "count": 1, "color": "black"}], "prompt": "a photo of a black backpack"}
343
+ {"tag": "colors", "include": [{"class": "car", "count": 1, "color": "red"}], "prompt": "a photo of a red car"}
344
+ {"tag": "colors", "include": [{"class": "computer mouse", "count": 1, "color": "green"}], "prompt": "a photo of a green computer mouse"}
345
+ {"tag": "colors", "include": [{"class": "backpack", "count": 1, "color": "red"}], "prompt": "a photo of a red backpack"}
346
+ {"tag": "colors", "include": [{"class": "bus", "count": 1, "color": "green"}], "prompt": "a photo of a green bus"}
347
+ {"tag": "colors", "include": [{"class": "toaster", "count": 1, "color": "orange"}], "prompt": "a photo of an orange toaster"}
348
+ {"tag": "colors", "include": [{"class": "fork", "count": 1, "color": "yellow"}], "prompt": "a photo of a yellow fork"}
349
+ {"tag": "colors", "include": [{"class": "parking meter", "count": 1, "color": "pink"}], "prompt": "a photo of a pink parking meter"}
350
+ {"tag": "colors", "include": [{"class": "book", "count": 1, "color": "blue"}], "prompt": "a photo of a blue book"}
351
+ {"tag": "colors", "include": [{"class": "broccoli", "count": 1, "color": "yellow"}], "prompt": "a photo of a yellow broccoli"}
352
+ {"tag": "colors", "include": [{"class": "computer mouse", "count": 1, "color": "orange"}], "prompt": "a photo of an orange computer mouse"}
353
+ {"tag": "colors", "include": [{"class": "cake", "count": 1, "color": "red"}], "prompt": "a photo of a red cake"}
354
+ {"tag": "position", "include": [{"class": "teddy bear", "count": 1}, {"class": "dog", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a dog right of a teddy bear"}
355
+ {"tag": "position", "include": [{"class": "kite", "count": 1}, {"class": "wine glass", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a wine glass above a kite"}
356
+ {"tag": "position", "include": [{"class": "cup", "count": 1}, {"class": "couch", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a couch below a cup"}
357
+ {"tag": "position", "include": [{"class": "cow", "count": 1}, {"class": "laptop", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a laptop left of a cow"}
358
+ {"tag": "position", "include": [{"class": "hair drier", "count": 1}, {"class": "fork", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a fork above a hair drier"}
359
+ {"tag": "position", "include": [{"class": "baseball bat", "count": 1}, {"class": "tie", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a tie right of a baseball bat"}
360
+ {"tag": "position", "include": [{"class": "fork", "count": 1}, {"class": "stop sign", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a stop sign above a fork"}
361
+ {"tag": "position", "include": [{"class": "skateboard", "count": 1}, {"class": "bird", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a bird below a skateboard"}
362
+ {"tag": "position", "include": [{"class": "tv", "count": 1}, {"class": "apple", "count": 1, "position": ["above", 0]}], "prompt": "a photo of an apple above a tv"}
363
+ {"tag": "position", "include": [{"class": "potted plant", "count": 1}, {"class": "train", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a train above a potted plant"}
364
+ {"tag": "position", "include": [{"class": "refrigerator", "count": 1}, {"class": "truck", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a truck left of a refrigerator"}
365
+ {"tag": "position", "include": [{"class": "cow", "count": 1}, {"class": "tv remote", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a tv remote below a cow"}
366
+ {"tag": "position", "include": [{"class": "train", "count": 1}, {"class": "bottle", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a bottle right of a train"}
367
+ {"tag": "position", "include": [{"class": "cow", "count": 1}, {"class": "dog", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a dog above a cow"}
368
+ {"tag": "position", "include": [{"class": "person", "count": 1}, {"class": "skateboard", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a skateboard above a person"}
369
+ {"tag": "position", "include": [{"class": "umbrella", "count": 1}, {"class": "baseball glove", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a baseball glove below an umbrella"}
370
+ {"tag": "position", "include": [{"class": "oven", "count": 1}, {"class": "dining table", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a dining table right of an oven"}
371
+ {"tag": "position", "include": [{"class": "suitcase", "count": 1}, {"class": "hot dog", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a hot dog left of a suitcase"}
372
+ {"tag": "position", "include": [{"class": "toothbrush", "count": 1}, {"class": "bus", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a bus below a toothbrush"}
373
+ {"tag": "position", "include": [{"class": "sandwich", "count": 1}, {"class": "backpack", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a backpack right of a sandwich"}
374
+ {"tag": "position", "include": [{"class": "baseball bat", "count": 1}, {"class": "cake", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a cake below a baseball bat"}
375
+ {"tag": "position", "include": [{"class": "tie", "count": 1}, {"class": "dog", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a dog right of a tie"}
376
+ {"tag": "position", "include": [{"class": "boat", "count": 1}, {"class": "suitcase", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a suitcase right of a boat"}
377
+ {"tag": "position", "include": [{"class": "clock", "count": 1}, {"class": "bear", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a bear above a clock"}
378
+ {"tag": "position", "include": [{"class": "umbrella", "count": 1}, {"class": "tv remote", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a tv remote left of an umbrella"}
379
+ {"tag": "position", "include": [{"class": "umbrella", "count": 1}, {"class": "sports ball", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a sports ball left of an umbrella"}
380
+ {"tag": "position", "include": [{"class": "dining table", "count": 1}, {"class": "train", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a train right of a dining table"}
381
+ {"tag": "position", "include": [{"class": "elephant", "count": 1}, {"class": "hair drier", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a hair drier below an elephant"}
382
+ {"tag": "position", "include": [{"class": "spoon", "count": 1}, {"class": "tennis racket", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a tennis racket right of a spoon"}
383
+ {"tag": "position", "include": [{"class": "hot dog", "count": 1}, {"class": "wine glass", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a wine glass right of a hot dog"}
384
+ {"tag": "position", "include": [{"class": "bench", "count": 1}, {"class": "computer mouse", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a computer mouse left of a bench"}
385
+ {"tag": "position", "include": [{"class": "orange", "count": 1}, {"class": "carrot", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a carrot left of an orange"}
386
+ {"tag": "position", "include": [{"class": "toothbrush", "count": 1}, {"class": "kite", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a kite above a toothbrush"}
387
+ {"tag": "position", "include": [{"class": "traffic light", "count": 1}, {"class": "toaster", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a toaster below a traffic light"}
388
+ {"tag": "position", "include": [{"class": "baseball glove", "count": 1}, {"class": "cat", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a cat below a baseball glove"}
389
+ {"tag": "position", "include": [{"class": "zebra", "count": 1}, {"class": "skis", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a skis right of a zebra"}
390
+ {"tag": "position", "include": [{"class": "chair", "count": 1}, {"class": "stop sign", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a stop sign above a chair"}
391
+ {"tag": "position", "include": [{"class": "parking meter", "count": 1}, {"class": "stop sign", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a stop sign above a parking meter"}
392
+ {"tag": "position", "include": [{"class": "skateboard", "count": 1}, {"class": "hot dog", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a hot dog right of a skateboard"}
393
+ {"tag": "position", "include": [{"class": "computer keyboard", "count": 1}, {"class": "pizza", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a pizza below a computer keyboard"}
394
+ {"tag": "position", "include": [{"class": "toilet", "count": 1}, {"class": "hair drier", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a hair drier left of a toilet"}
395
+ {"tag": "position", "include": [{"class": "stop sign", "count": 1}, {"class": "cow", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a cow left of a stop sign"}
396
+ {"tag": "position", "include": [{"class": "skis", "count": 1}, {"class": "suitcase", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a suitcase above a skis"}
397
+ {"tag": "position", "include": [{"class": "laptop", "count": 1}, {"class": "book", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a book above a laptop"}
398
+ {"tag": "position", "include": [{"class": "pizza", "count": 1}, {"class": "toothbrush", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a toothbrush below a pizza"}
399
+ {"tag": "position", "include": [{"class": "kite", "count": 1}, {"class": "toilet", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a toilet left of a kite"}
400
+ {"tag": "position", "include": [{"class": "sink", "count": 1}, {"class": "tie", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a tie above a sink"}
401
+ {"tag": "position", "include": [{"class": "couch", "count": 1}, {"class": "bird", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a bird left of a couch"}
402
+ {"tag": "position", "include": [{"class": "sports ball", "count": 1}, {"class": "bed", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a bed right of a sports ball"}
403
+ {"tag": "position", "include": [{"class": "surfboard", "count": 1}, {"class": "elephant", "count": 1, "position": ["below", 0]}], "prompt": "a photo of an elephant below a surfboard"}
404
+ {"tag": "position", "include": [{"class": "motorcycle", "count": 1}, {"class": "frisbee", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a frisbee right of a motorcycle"}
405
+ {"tag": "position", "include": [{"class": "fire hydrant", "count": 1}, {"class": "vase", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a vase above a fire hydrant"}
406
+ {"tag": "position", "include": [{"class": "elephant", "count": 1}, {"class": "zebra", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a zebra left of an elephant"}
407
+ {"tag": "position", "include": [{"class": "bear", "count": 1}, {"class": "bench", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a bench left of a bear"}
408
+ {"tag": "position", "include": [{"class": "bench", "count": 1}, {"class": "donut", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a donut right of a bench"}
409
+ {"tag": "position", "include": [{"class": "horse", "count": 1}, {"class": "frisbee", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a frisbee below a horse"}
410
+ {"tag": "position", "include": [{"class": "snowboard", "count": 1}, {"class": "computer keyboard", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a computer keyboard above a snowboard"}
411
+ {"tag": "position", "include": [{"class": "cow", "count": 1}, {"class": "tv", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a tv below a cow"}
412
+ {"tag": "position", "include": [{"class": "horse", "count": 1}, {"class": "elephant", "count": 1, "position": ["below", 0]}], "prompt": "a photo of an elephant below a horse"}
413
+ {"tag": "position", "include": [{"class": "banana", "count": 1}, {"class": "suitcase", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a suitcase left of a banana"}
414
+ {"tag": "position", "include": [{"class": "airplane", "count": 1}, {"class": "train", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a train below an airplane"}
415
+ {"tag": "position", "include": [{"class": "backpack", "count": 1}, {"class": "cat", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a cat below a backpack"}
416
+ {"tag": "position", "include": [{"class": "cake", "count": 1}, {"class": "backpack", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a backpack below a cake"}
417
+ {"tag": "position", "include": [{"class": "knife", "count": 1}, {"class": "sandwich", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a sandwich below a knife"}
418
+ {"tag": "position", "include": [{"class": "parking meter", "count": 1}, {"class": "bicycle", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a bicycle above a parking meter"}
419
+ {"tag": "position", "include": [{"class": "suitcase", "count": 1}, {"class": "knife", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a knife right of a suitcase"}
420
+ {"tag": "position", "include": [{"class": "knife", "count": 1}, {"class": "hot dog", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a hot dog above a knife"}
421
+ {"tag": "position", "include": [{"class": "parking meter", "count": 1}, {"class": "zebra", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a zebra right of a parking meter"}
422
+ {"tag": "position", "include": [{"class": "zebra", "count": 1}, {"class": "chair", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a chair left of a zebra"}
423
+ {"tag": "position", "include": [{"class": "airplane", "count": 1}, {"class": "cow", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a cow below an airplane"}
424
+ {"tag": "position", "include": [{"class": "umbrella", "count": 1}, {"class": "cup", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a cup left of an umbrella"}
425
+ {"tag": "position", "include": [{"class": "computer keyboard", "count": 1}, {"class": "zebra", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a zebra below a computer keyboard"}
426
+ {"tag": "position", "include": [{"class": "broccoli", "count": 1}, {"class": "zebra", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a zebra below a broccoli"}
427
+ {"tag": "position", "include": [{"class": "sports ball", "count": 1}, {"class": "laptop", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a laptop below a sports ball"}
428
+ {"tag": "position", "include": [{"class": "baseball bat", "count": 1}, {"class": "truck", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a truck left of a baseball bat"}
429
+ {"tag": "position", "include": [{"class": "baseball bat", "count": 1}, {"class": "refrigerator", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a refrigerator above a baseball bat"}
430
+ {"tag": "position", "include": [{"class": "baseball bat", "count": 1}, {"class": "tv", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a tv above a baseball bat"}
431
+ {"tag": "position", "include": [{"class": "bear", "count": 1}, {"class": "baseball glove", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a baseball glove right of a bear"}
432
+ {"tag": "position", "include": [{"class": "scissors", "count": 1}, {"class": "refrigerator", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a refrigerator below a scissors"}
433
+ {"tag": "position", "include": [{"class": "suitcase", "count": 1}, {"class": "dining table", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a dining table above a suitcase"}
434
+ {"tag": "position", "include": [{"class": "broccoli", "count": 1}, {"class": "parking meter", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a parking meter above a broccoli"}
435
+ {"tag": "position", "include": [{"class": "truck", "count": 1}, {"class": "frisbee", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a frisbee above a truck"}
436
+ {"tag": "position", "include": [{"class": "banana", "count": 1}, {"class": "pizza", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a pizza right of a banana"}
437
+ {"tag": "position", "include": [{"class": "boat", "count": 1}, {"class": "bus", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a bus above a boat"}
438
+ {"tag": "position", "include": [{"class": "tennis racket", "count": 1}, {"class": "cell phone", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a cell phone left of a tennis racket"}
439
+ {"tag": "position", "include": [{"class": "broccoli", "count": 1}, {"class": "horse", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a horse right of a broccoli"}
440
+ {"tag": "position", "include": [{"class": "bottle", "count": 1}, {"class": "broccoli", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a broccoli above a bottle"}
441
+ {"tag": "position", "include": [{"class": "horse", "count": 1}, {"class": "vase", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a vase right of a horse"}
442
+ {"tag": "position", "include": [{"class": "spoon", "count": 1}, {"class": "bear", "count": 1, "position": ["above", 0]}], "prompt": "a photo of a bear above a spoon"}
443
+ {"tag": "position", "include": [{"class": "bed", "count": 1}, {"class": "zebra", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a zebra right of a bed"}
444
+ {"tag": "position", "include": [{"class": "laptop", "count": 1}, {"class": "cow", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a cow right of a laptop"}
445
+ {"tag": "position", "include": [{"class": "frisbee", "count": 1}, {"class": "bed", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a bed right of a frisbee"}
446
+ {"tag": "position", "include": [{"class": "motorcycle", "count": 1}, {"class": "tie", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a tie right of a motorcycle"}
447
+ {"tag": "position", "include": [{"class": "tv", "count": 1}, {"class": "laptop", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a laptop right of a tv"}
448
+ {"tag": "position", "include": [{"class": "chair", "count": 1}, {"class": "cell phone", "count": 1, "position": ["right of", 0]}], "prompt": "a photo of a cell phone right of a chair"}
449
+ {"tag": "position", "include": [{"class": "potted plant", "count": 1}, {"class": "couch", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a couch below a potted plant"}
450
+ {"tag": "position", "include": [{"class": "tv", "count": 1}, {"class": "clock", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a clock below a tv"}
451
+ {"tag": "position", "include": [{"class": "vase", "count": 1}, {"class": "couch", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a couch below a vase"}
452
+ {"tag": "position", "include": [{"class": "cat", "count": 1}, {"class": "donut", "count": 1, "position": ["below", 0]}], "prompt": "a photo of a donut below a cat"}
453
+ {"tag": "position", "include": [{"class": "toaster", "count": 1}, {"class": "couch", "count": 1, "position": ["left of", 0]}], "prompt": "a photo of a couch left of a toaster"}
454
+ {"tag": "color_attr", "include": [{"class": "wine glass", "count": 1, "color": "purple"}, {"class": "apple", "count": 1, "color": "black"}], "prompt": "a photo of a purple wine glass and a black apple"}
455
+ {"tag": "color_attr", "include": [{"class": "bus", "count": 1, "color": "green"}, {"class": "microwave", "count": 1, "color": "purple"}], "prompt": "a photo of a green bus and a purple microwave"}
456
+ {"tag": "color_attr", "include": [{"class": "skis", "count": 1, "color": "green"}, {"class": "airplane", "count": 1, "color": "brown"}], "prompt": "a photo of a green skis and a brown airplane"}
457
+ {"tag": "color_attr", "include": [{"class": "computer keyboard", "count": 1, "color": "yellow"}, {"class": "sink", "count": 1, "color": "black"}], "prompt": "a photo of a yellow computer keyboard and a black sink"}
458
+ {"tag": "color_attr", "include": [{"class": "oven", "count": 1, "color": "pink"}, {"class": "motorcycle", "count": 1, "color": "green"}], "prompt": "a photo of a pink oven and a green motorcycle"}
459
+ {"tag": "color_attr", "include": [{"class": "parking meter", "count": 1, "color": "purple"}, {"class": "laptop", "count": 1, "color": "red"}], "prompt": "a photo of a purple parking meter and a red laptop"}
460
+ {"tag": "color_attr", "include": [{"class": "skateboard", "count": 1, "color": "yellow"}, {"class": "computer mouse", "count": 1, "color": "orange"}], "prompt": "a photo of a yellow skateboard and an orange computer mouse"}
461
+ {"tag": "color_attr", "include": [{"class": "skis", "count": 1, "color": "red"}, {"class": "tie", "count": 1, "color": "brown"}], "prompt": "a photo of a red skis and a brown tie"}
462
+ {"tag": "color_attr", "include": [{"class": "skateboard", "count": 1, "color": "pink"}, {"class": "train", "count": 1, "color": "black"}], "prompt": "a photo of a pink skateboard and a black train"}
463
+ {"tag": "color_attr", "include": [{"class": "handbag", "count": 1, "color": "white"}, {"class": "bed", "count": 1, "color": "purple"}], "prompt": "a photo of a white handbag and a purple bed"}
464
+ {"tag": "color_attr", "include": [{"class": "elephant", "count": 1, "color": "purple"}, {"class": "sports ball", "count": 1, "color": "brown"}], "prompt": "a photo of a purple elephant and a brown sports ball"}
465
+ {"tag": "color_attr", "include": [{"class": "dog", "count": 1, "color": "purple"}, {"class": "dining table", "count": 1, "color": "black"}], "prompt": "a photo of a purple dog and a black dining table"}
466
+ {"tag": "color_attr", "include": [{"class": "dining table", "count": 1, "color": "white"}, {"class": "car", "count": 1, "color": "red"}], "prompt": "a photo of a white dining table and a red car"}
467
+ {"tag": "color_attr", "include": [{"class": "cell phone", "count": 1, "color": "blue"}, {"class": "apple", "count": 1, "color": "green"}], "prompt": "a photo of a blue cell phone and a green apple"}
468
+ {"tag": "color_attr", "include": [{"class": "car", "count": 1, "color": "red"}, {"class": "potted plant", "count": 1, "color": "orange"}], "prompt": "a photo of a red car and an orange potted plant"}
469
+ {"tag": "color_attr", "include": [{"class": "carrot", "count": 1, "color": "brown"}, {"class": "potted plant", "count": 1, "color": "white"}], "prompt": "a photo of a brown carrot and a white potted plant"}
470
+ {"tag": "color_attr", "include": [{"class": "kite", "count": 1, "color": "black"}, {"class": "bear", "count": 1, "color": "green"}], "prompt": "a photo of a black kite and a green bear"}
471
+ {"tag": "color_attr", "include": [{"class": "laptop", "count": 1, "color": "blue"}, {"class": "bear", "count": 1, "color": "brown"}], "prompt": "a photo of a blue laptop and a brown bear"}
472
+ {"tag": "color_attr", "include": [{"class": "teddy bear", "count": 1, "color": "green"}, {"class": "kite", "count": 1, "color": "brown"}], "prompt": "a photo of a green teddy bear and a brown kite"}
473
+ {"tag": "color_attr", "include": [{"class": "stop sign", "count": 1, "color": "yellow"}, {"class": "potted plant", "count": 1, "color": "blue"}], "prompt": "a photo of a yellow stop sign and a blue potted plant"}
474
+ {"tag": "color_attr", "include": [{"class": "snowboard", "count": 1, "color": "orange"}, {"class": "cat", "count": 1, "color": "green"}], "prompt": "a photo of an orange snowboard and a green cat"}
475
+ {"tag": "color_attr", "include": [{"class": "truck", "count": 1, "color": "orange"}, {"class": "sink", "count": 1, "color": "pink"}], "prompt": "a photo of an orange truck and a pink sink"}
476
+ {"tag": "color_attr", "include": [{"class": "hot dog", "count": 1, "color": "brown"}, {"class": "pizza", "count": 1, "color": "purple"}], "prompt": "a photo of a brown hot dog and a purple pizza"}
477
+ {"tag": "color_attr", "include": [{"class": "couch", "count": 1, "color": "green"}, {"class": "umbrella", "count": 1, "color": "orange"}], "prompt": "a photo of a green couch and an orange umbrella"}
478
+ {"tag": "color_attr", "include": [{"class": "bed", "count": 1, "color": "brown"}, {"class": "cell phone", "count": 1, "color": "pink"}], "prompt": "a photo of a brown bed and a pink cell phone"}
479
+ {"tag": "color_attr", "include": [{"class": "broccoli", "count": 1, "color": "black"}, {"class": "cake", "count": 1, "color": "yellow"}], "prompt": "a photo of a black broccoli and a yellow cake"}
480
+ {"tag": "color_attr", "include": [{"class": "train", "count": 1, "color": "red"}, {"class": "bear", "count": 1, "color": "purple"}], "prompt": "a photo of a red train and a purple bear"}
481
+ {"tag": "color_attr", "include": [{"class": "tennis racket", "count": 1, "color": "purple"}, {"class": "sink", "count": 1, "color": "black"}], "prompt": "a photo of a purple tennis racket and a black sink"}
482
+ {"tag": "color_attr", "include": [{"class": "vase", "count": 1, "color": "blue"}, {"class": "banana", "count": 1, "color": "black"}], "prompt": "a photo of a blue vase and a black banana"}
483
+ {"tag": "color_attr", "include": [{"class": "clock", "count": 1, "color": "blue"}, {"class": "cup", "count": 1, "color": "white"}], "prompt": "a photo of a blue clock and a white cup"}
484
+ {"tag": "color_attr", "include": [{"class": "umbrella", "count": 1, "color": "red"}, {"class": "couch", "count": 1, "color": "blue"}], "prompt": "a photo of a red umbrella and a blue couch"}
485
+ {"tag": "color_attr", "include": [{"class": "handbag", "count": 1, "color": "white"}, {"class": "giraffe", "count": 1, "color": "red"}], "prompt": "a photo of a white handbag and a red giraffe"}
486
+ {"tag": "color_attr", "include": [{"class": "tv remote", "count": 1, "color": "pink"}, {"class": "airplane", "count": 1, "color": "blue"}], "prompt": "a photo of a pink tv remote and a blue airplane"}
487
+ {"tag": "color_attr", "include": [{"class": "handbag", "count": 1, "color": "pink"}, {"class": "scissors", "count": 1, "color": "black"}], "prompt": "a photo of a pink handbag and a black scissors"}
488
+ {"tag": "color_attr", "include": [{"class": "car", "count": 1, "color": "brown"}, {"class": "hair drier", "count": 1, "color": "pink"}], "prompt": "a photo of a brown car and a pink hair drier"}
489
+ {"tag": "color_attr", "include": [{"class": "bus", "count": 1, "color": "black"}, {"class": "cell phone", "count": 1, "color": "brown"}], "prompt": "a photo of a black bus and a brown cell phone"}
490
+ {"tag": "color_attr", "include": [{"class": "sheep", "count": 1, "color": "purple"}, {"class": "banana", "count": 1, "color": "pink"}], "prompt": "a photo of a purple sheep and a pink banana"}
491
+ {"tag": "color_attr", "include": [{"class": "handbag", "count": 1, "color": "blue"}, {"class": "cell phone", "count": 1, "color": "white"}], "prompt": "a photo of a blue handbag and a white cell phone"}
492
+ {"tag": "color_attr", "include": [{"class": "pizza", "count": 1, "color": "white"}, {"class": "umbrella", "count": 1, "color": "green"}], "prompt": "a photo of a white pizza and a green umbrella"}
493
+ {"tag": "color_attr", "include": [{"class": "tie", "count": 1, "color": "white"}, {"class": "skateboard", "count": 1, "color": "purple"}], "prompt": "a photo of a white tie and a purple skateboard"}
494
+ {"tag": "color_attr", "include": [{"class": "sports ball", "count": 1, "color": "yellow"}, {"class": "boat", "count": 1, "color": "green"}], "prompt": "a photo of a yellow sports ball and a green boat"}
495
+ {"tag": "color_attr", "include": [{"class": "wine glass", "count": 1, "color": "white"}, {"class": "giraffe", "count": 1, "color": "brown"}], "prompt": "a photo of a white wine glass and a brown giraffe"}
496
+ {"tag": "color_attr", "include": [{"class": "bowl", "count": 1, "color": "yellow"}, {"class": "baseball glove", "count": 1, "color": "white"}], "prompt": "a photo of a yellow bowl and a white baseball glove"}
497
+ {"tag": "color_attr", "include": [{"class": "microwave", "count": 1, "color": "orange"}, {"class": "spoon", "count": 1, "color": "black"}], "prompt": "a photo of an orange microwave and a black spoon"}
498
+ {"tag": "color_attr", "include": [{"class": "skateboard", "count": 1, "color": "orange"}, {"class": "bowl", "count": 1, "color": "pink"}], "prompt": "a photo of an orange skateboard and a pink bowl"}
499
+ {"tag": "color_attr", "include": [{"class": "toilet", "count": 1, "color": "blue"}, {"class": "suitcase", "count": 1, "color": "white"}], "prompt": "a photo of a blue toilet and a white suitcase"}
500
+ {"tag": "color_attr", "include": [{"class": "boat", "count": 1, "color": "white"}, {"class": "hot dog", "count": 1, "color": "orange"}], "prompt": "a photo of a white boat and an orange hot dog"}
501
+ {"tag": "color_attr", "include": [{"class": "dining table", "count": 1, "color": "yellow"}, {"class": "dog", "count": 1, "color": "pink"}], "prompt": "a photo of a yellow dining table and a pink dog"}
502
+ {"tag": "color_attr", "include": [{"class": "cake", "count": 1, "color": "red"}, {"class": "chair", "count": 1, "color": "purple"}], "prompt": "a photo of a red cake and a purple chair"}
503
+ {"tag": "color_attr", "include": [{"class": "tie", "count": 1, "color": "blue"}, {"class": "dining table", "count": 1, "color": "pink"}], "prompt": "a photo of a blue tie and a pink dining table"}
504
+ {"tag": "color_attr", "include": [{"class": "cow", "count": 1, "color": "blue"}, {"class": "computer keyboard", "count": 1, "color": "black"}], "prompt": "a photo of a blue cow and a black computer keyboard"}
505
+ {"tag": "color_attr", "include": [{"class": "pizza", "count": 1, "color": "yellow"}, {"class": "oven", "count": 1, "color": "green"}], "prompt": "a photo of a yellow pizza and a green oven"}
506
+ {"tag": "color_attr", "include": [{"class": "laptop", "count": 1, "color": "red"}, {"class": "car", "count": 1, "color": "brown"}], "prompt": "a photo of a red laptop and a brown car"}
507
+ {"tag": "color_attr", "include": [{"class": "computer keyboard", "count": 1, "color": "purple"}, {"class": "scissors", "count": 1, "color": "blue"}], "prompt": "a photo of a purple computer keyboard and a blue scissors"}
508
+ {"tag": "color_attr", "include": [{"class": "surfboard", "count": 1, "color": "green"}, {"class": "oven", "count": 1, "color": "orange"}], "prompt": "a photo of a green surfboard and an orange oven"}
509
+ {"tag": "color_attr", "include": [{"class": "parking meter", "count": 1, "color": "yellow"}, {"class": "refrigerator", "count": 1, "color": "pink"}], "prompt": "a photo of a yellow parking meter and a pink refrigerator"}
510
+ {"tag": "color_attr", "include": [{"class": "computer mouse", "count": 1, "color": "brown"}, {"class": "bottle", "count": 1, "color": "purple"}], "prompt": "a photo of a brown computer mouse and a purple bottle"}
511
+ {"tag": "color_attr", "include": [{"class": "umbrella", "count": 1, "color": "red"}, {"class": "cow", "count": 1, "color": "green"}], "prompt": "a photo of a red umbrella and a green cow"}
512
+ {"tag": "color_attr", "include": [{"class": "giraffe", "count": 1, "color": "red"}, {"class": "cell phone", "count": 1, "color": "black"}], "prompt": "a photo of a red giraffe and a black cell phone"}
513
+ {"tag": "color_attr", "include": [{"class": "oven", "count": 1, "color": "brown"}, {"class": "train", "count": 1, "color": "purple"}], "prompt": "a photo of a brown oven and a purple train"}
514
+ {"tag": "color_attr", "include": [{"class": "baseball bat", "count": 1, "color": "blue"}, {"class": "book", "count": 1, "color": "pink"}], "prompt": "a photo of a blue baseball bat and a pink book"}
515
+ {"tag": "color_attr", "include": [{"class": "cup", "count": 1, "color": "green"}, {"class": "bowl", "count": 1, "color": "yellow"}], "prompt": "a photo of a green cup and a yellow bowl"}
516
+ {"tag": "color_attr", "include": [{"class": "suitcase", "count": 1, "color": "yellow"}, {"class": "bus", "count": 1, "color": "brown"}], "prompt": "a photo of a yellow suitcase and a brown bus"}
517
+ {"tag": "color_attr", "include": [{"class": "motorcycle", "count": 1, "color": "orange"}, {"class": "donut", "count": 1, "color": "pink"}], "prompt": "a photo of an orange motorcycle and a pink donut"}
518
+ {"tag": "color_attr", "include": [{"class": "giraffe", "count": 1, "color": "orange"}, {"class": "baseball glove", "count": 1, "color": "white"}], "prompt": "a photo of an orange giraffe and a white baseball glove"}
519
+ {"tag": "color_attr", "include": [{"class": "handbag", "count": 1, "color": "orange"}, {"class": "carrot", "count": 1, "color": "green"}], "prompt": "a photo of an orange handbag and a green carrot"}
520
+ {"tag": "color_attr", "include": [{"class": "bottle", "count": 1, "color": "black"}, {"class": "refrigerator", "count": 1, "color": "white"}], "prompt": "a photo of a black bottle and a white refrigerator"}
521
+ {"tag": "color_attr", "include": [{"class": "dog", "count": 1, "color": "white"}, {"class": "potted plant", "count": 1, "color": "blue"}], "prompt": "a photo of a white dog and a blue potted plant"}
522
+ {"tag": "color_attr", "include": [{"class": "handbag", "count": 1, "color": "orange"}, {"class": "car", "count": 1, "color": "red"}], "prompt": "a photo of an orange handbag and a red car"}
523
+ {"tag": "color_attr", "include": [{"class": "stop sign", "count": 1, "color": "red"}, {"class": "book", "count": 1, "color": "blue"}], "prompt": "a photo of a red stop sign and a blue book"}
524
+ {"tag": "color_attr", "include": [{"class": "car", "count": 1, "color": "yellow"}, {"class": "toothbrush", "count": 1, "color": "orange"}], "prompt": "a photo of a yellow car and an orange toothbrush"}
525
+ {"tag": "color_attr", "include": [{"class": "potted plant", "count": 1, "color": "black"}, {"class": "toilet", "count": 1, "color": "yellow"}], "prompt": "a photo of a black potted plant and a yellow toilet"}
526
+ {"tag": "color_attr", "include": [{"class": "dining table", "count": 1, "color": "brown"}, {"class": "suitcase", "count": 1, "color": "white"}], "prompt": "a photo of a brown dining table and a white suitcase"}
527
+ {"tag": "color_attr", "include": [{"class": "donut", "count": 1, "color": "orange"}, {"class": "stop sign", "count": 1, "color": "yellow"}], "prompt": "a photo of an orange donut and a yellow stop sign"}
528
+ {"tag": "color_attr", "include": [{"class": "suitcase", "count": 1, "color": "green"}, {"class": "boat", "count": 1, "color": "blue"}], "prompt": "a photo of a green suitcase and a blue boat"}
529
+ {"tag": "color_attr", "include": [{"class": "tennis racket", "count": 1, "color": "orange"}, {"class": "sports ball", "count": 1, "color": "yellow"}], "prompt": "a photo of an orange tennis racket and a yellow sports ball"}
530
+ {"tag": "color_attr", "include": [{"class": "computer keyboard", "count": 1, "color": "purple"}, {"class": "chair", "count": 1, "color": "red"}], "prompt": "a photo of a purple computer keyboard and a red chair"}
531
+ {"tag": "color_attr", "include": [{"class": "suitcase", "count": 1, "color": "purple"}, {"class": "pizza", "count": 1, "color": "orange"}], "prompt": "a photo of a purple suitcase and an orange pizza"}
532
+ {"tag": "color_attr", "include": [{"class": "bottle", "count": 1, "color": "white"}, {"class": "sheep", "count": 1, "color": "blue"}], "prompt": "a photo of a white bottle and a blue sheep"}
533
+ {"tag": "color_attr", "include": [{"class": "backpack", "count": 1, "color": "purple"}, {"class": "umbrella", "count": 1, "color": "white"}], "prompt": "a photo of a purple backpack and a white umbrella"}
534
+ {"tag": "color_attr", "include": [{"class": "potted plant", "count": 1, "color": "orange"}, {"class": "spoon", "count": 1, "color": "black"}], "prompt": "a photo of an orange potted plant and a black spoon"}
535
+ {"tag": "color_attr", "include": [{"class": "tennis racket", "count": 1, "color": "green"}, {"class": "dog", "count": 1, "color": "black"}], "prompt": "a photo of a green tennis racket and a black dog"}
536
+ {"tag": "color_attr", "include": [{"class": "handbag", "count": 1, "color": "yellow"}, {"class": "refrigerator", "count": 1, "color": "blue"}], "prompt": "a photo of a yellow handbag and a blue refrigerator"}
537
+ {"tag": "color_attr", "include": [{"class": "broccoli", "count": 1, "color": "pink"}, {"class": "sink", "count": 1, "color": "red"}], "prompt": "a photo of a pink broccoli and a red sink"}
538
+ {"tag": "color_attr", "include": [{"class": "bowl", "count": 1, "color": "red"}, {"class": "sink", "count": 1, "color": "pink"}], "prompt": "a photo of a red bowl and a pink sink"}
539
+ {"tag": "color_attr", "include": [{"class": "toilet", "count": 1, "color": "white"}, {"class": "apple", "count": 1, "color": "red"}], "prompt": "a photo of a white toilet and a red apple"}
540
+ {"tag": "color_attr", "include": [{"class": "dining table", "count": 1, "color": "pink"}, {"class": "sandwich", "count": 1, "color": "black"}], "prompt": "a photo of a pink dining table and a black sandwich"}
541
+ {"tag": "color_attr", "include": [{"class": "car", "count": 1, "color": "black"}, {"class": "parking meter", "count": 1, "color": "green"}], "prompt": "a photo of a black car and a green parking meter"}
542
+ {"tag": "color_attr", "include": [{"class": "bird", "count": 1, "color": "yellow"}, {"class": "motorcycle", "count": 1, "color": "black"}], "prompt": "a photo of a yellow bird and a black motorcycle"}
543
+ {"tag": "color_attr", "include": [{"class": "giraffe", "count": 1, "color": "brown"}, {"class": "stop sign", "count": 1, "color": "white"}], "prompt": "a photo of a brown giraffe and a white stop sign"}
544
+ {"tag": "color_attr", "include": [{"class": "banana", "count": 1, "color": "white"}, {"class": "elephant", "count": 1, "color": "black"}], "prompt": "a photo of a white banana and a black elephant"}
545
+ {"tag": "color_attr", "include": [{"class": "cow", "count": 1, "color": "orange"}, {"class": "sandwich", "count": 1, "color": "purple"}], "prompt": "a photo of an orange cow and a purple sandwich"}
546
+ {"tag": "color_attr", "include": [{"class": "clock", "count": 1, "color": "red"}, {"class": "cell phone", "count": 1, "color": "black"}], "prompt": "a photo of a red clock and a black cell phone"}
547
+ {"tag": "color_attr", "include": [{"class": "knife", "count": 1, "color": "brown"}, {"class": "donut", "count": 1, "color": "blue"}], "prompt": "a photo of a brown knife and a blue donut"}
548
+ {"tag": "color_attr", "include": [{"class": "cup", "count": 1, "color": "red"}, {"class": "handbag", "count": 1, "color": "pink"}], "prompt": "a photo of a red cup and a pink handbag"}
549
+ {"tag": "color_attr", "include": [{"class": "bicycle", "count": 1, "color": "yellow"}, {"class": "motorcycle", "count": 1, "color": "red"}], "prompt": "a photo of a yellow bicycle and a red motorcycle"}
550
+ {"tag": "color_attr", "include": [{"class": "orange", "count": 1, "color": "red"}, {"class": "broccoli", "count": 1, "color": "purple"}], "prompt": "a photo of a red orange and a purple broccoli"}
551
+ {"tag": "color_attr", "include": [{"class": "traffic light", "count": 1, "color": "orange"}, {"class": "toilet", "count": 1, "color": "white"}], "prompt": "a photo of an orange traffic light and a white toilet"}
552
+ {"tag": "color_attr", "include": [{"class": "cup", "count": 1, "color": "green"}, {"class": "pizza", "count": 1, "color": "red"}], "prompt": "a photo of a green cup and a red pizza"}
553
+ {"tag": "color_attr", "include": [{"class": "pizza", "count": 1, "color": "blue"}, {"class": "baseball glove", "count": 1, "color": "yellow"}], "prompt": "a photo of a blue pizza and a yellow baseball glove"}
prompts/ocr_test.txt ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,202 @@
1
+ accelerate==1.7.0
2
+ aiohappyeyeballs==2.6.1
3
+ aiohttp==3.12.9
4
+ aiosignal==1.3.2
5
+ airportsdata==20250523
6
+ annotated-types==0.7.0
7
+ anthropic==0.54.0
8
+ antlr4-python3-runtime==4.13.2
9
+ anyio==4.9.0
10
+ astor==0.8.1
11
+ asttokens==3.0.0
12
+ attrs==25.3.0
13
+ av==14.4.0
14
+ bitsandbytes==0.46.0
15
+ blake3==1.0.5
16
+ cachetools==6.0.0
17
+ certifi==2025.4.26
18
+ charset-normalizer==3.4.2
19
+ click==8.2.1
20
+ cloudpickle==3.1.1
21
+ compressed-tensors==0.9.4
22
+ contourpy==1.3.2
23
+ cupy-cuda12x==13.4.1
24
+ cycler==0.12.1
25
+ datasets==3.6.0
26
+ decorator==5.2.1
27
+ deepspeed==0.15.4
28
+ depyf==0.18.0
29
+ dill==0.3.8
30
+ diskcache==5.6.3
31
+ distro==1.9.0
32
+ dnspython==2.7.0
33
+ docker-pycreds==0.4.0
34
+ einops==0.8.1
35
+ email-validator==2.2.0
36
+ executing==2.2.0
37
+ fastapi==0.115.12
38
+ fastapi-cli==0.0.7
39
+ fastrlock==0.8.3
40
+ filelock==3.18.0
41
+ fonttools==4.58.4
42
+ frozenlist==1.6.2
43
+ fsspec==2025.3.0
44
+ ftfy==6.3.1
45
+ gguf==0.17.0
46
+ gitdb==4.0.12
47
+ gitpython==3.1.44
48
+ googleapis-common-protos==1.70.0
49
+ grpcio==1.72.1
50
+ h11==0.16.0
51
+ hf-transfer==0.1.9
52
+ hf-xet==1.1.3
53
+ hjson==3.1.0
54
+ httpcore==1.0.9
55
+ httptools==0.6.4
56
+ httpx==0.28.1
57
+ huggingface-hub==0.32.4
58
+ idna==3.10
59
+ importlib-metadata==8.7.0
60
+ inquirerpy==0.3.4
61
+ interegular==0.3.3
62
+ ipython==9.3.0
63
+ ipython-pygments-lexers==1.1.1
64
+ jedi==0.19.2
65
+ jinja2==3.1.6
66
+ jiter==0.10.0
67
+ jsonschema==4.24.0
68
+ jsonschema-specifications==2025.4.1
69
+ kiwisolver==1.4.8
70
+ lark==1.2.2
71
+ latex2sympy2-extended==1.10.1
72
+ liger-kernel==0.5.2
73
+ llguidance==0.7.29
74
+ llvmlite==0.44.0
75
+ lm-format-enforcer==0.10.11
76
+ markdown-it-py==3.0.0
77
+ markupsafe==3.0.2
78
+ math-verify==0.7.0
79
+ matplotlib==3.10.3
80
+ matplotlib-inline==0.1.7
81
+ mdurl==0.1.2
82
+ mistral-common==1.5.6
83
+ mpmath==1.3.0
84
+ msgpack==1.1.0
85
+ msgspec==0.19.0
86
+ multidict==6.4.4
87
+ multiprocess==0.70.16
88
+ nest-asyncio==1.6.0
89
+ networkx==3.5
90
+ ninja==1.11.1.4
91
+ numba==0.61.2
92
+ numpy==2.2.6
93
+ nvidia-cublas-cu12==12.6.4.1
94
+ nvidia-cuda-cupti-cu12==12.6.80
95
+ nvidia-cuda-nvrtc-cu12==12.6.77
96
+ nvidia-cuda-runtime-cu12==12.6.77
97
+ nvidia-cudnn-cu12==9.5.1.17
98
+ nvidia-cufft-cu12==11.3.0.4
99
+ nvidia-cufile-cu12==1.11.1.6
100
+ nvidia-curand-cu12==10.3.7.77
101
+ nvidia-cusolver-cu12==11.7.1.2
102
+ nvidia-cusparse-cu12==12.5.4.2
103
+ nvidia-cusparselt-cu12==0.6.3
104
+ nvidia-nccl-cu12==2.26.2
105
+ nvidia-nvjitlink-cu12==12.6.85
106
+ nvidia-nvtx-cu12==12.6.77
107
+ openai==1.84.0
108
+ opencv-python-headless==4.11.0.86
109
+ opentelemetry-api==1.34.0
110
+ opentelemetry-exporter-otlp==1.34.0
111
+ opentelemetry-exporter-otlp-proto-common==1.34.0
112
+ opentelemetry-exporter-otlp-proto-grpc==1.34.0
113
+ opentelemetry-exporter-otlp-proto-http==1.34.0
114
+ opentelemetry-proto==1.34.0
115
+ opentelemetry-sdk==1.34.0
116
+ opentelemetry-semantic-conventions==0.55b0
117
+ opentelemetry-semantic-conventions-ai==0.4.9
118
+ outlines==0.1.11
119
+ outlines-core==0.1.26
120
+ packaging==25.0
121
+ pandas==2.3.0
122
+ parso==0.8.4
123
+ partial-json-parser==0.2.1.1.post5
124
+ peft==0.17.1
125
+ pexpect==4.9.0
126
+ pfzy==0.3.4
127
+ pillow==11.2.1
128
+ platformdirs==4.3.8
129
+ prometheus-client==0.22.1
130
+ prometheus-fastapi-instrumentator==7.1.0
131
+ prompt-toolkit==3.0.51
132
+ propcache==0.3.1
133
+ protobuf==5.29.5
134
+ psutil==7.0.0
135
+ ptyprocess==0.7.0
136
+ pure-eval==0.2.3
137
+ py-cpuinfo==9.0.0
138
+ pyarrow==20.0.0
139
+ pycountry==24.6.1
140
+ pydantic==2.11.5
141
+ pydantic-core==2.33.2
142
+ pygments==2.19.1
143
+ pyparsing==3.2.3
144
+ python-dateutil==2.9.0.post0
145
+ python-dotenv==1.1.0
146
+ python-json-logger==3.3.0
147
+ python-multipart==0.0.20
148
+ pytz==2025.2
149
+ pyyaml==6.0.2
150
+ pyzmq==26.4.0
151
+ qwen-vl-utils==0.0.11
152
+ ray==2.46.0
153
+ referencing==0.36.2
154
+ regex==2024.11.6
155
+ requests==2.32.3
156
+ rich==14.0.0
157
+ rich-toolkit==0.14.7
158
+ rpds-py==0.25.1
159
+ safetensors==0.5.3
160
+ scipy==1.15.3
161
+ seaborn==0.13.2
162
+ sentencepiece==0.2.0
163
+ sentry-sdk==2.29.1
164
+ setproctitle==1.3.6
165
+ shellingham==1.5.4
166
+ six==1.17.0
167
+ smmap==5.0.2
168
+ sniffio==1.3.1
169
+ stack-data==0.6.3
170
+ starlette==0.46.2
171
+ sympy==1.14.0
172
+ tabulate==0.9.0
173
+ tiktoken==0.9.0
174
+ timm==0.6.13
175
+ tokenizers==0.21.1
176
+ torch==2.7.0
177
+ torchaudio==2.7.0
178
+ torchvision==0.22.0
179
+ tqdm==4.67.1
180
+ traitlets==5.14.3
181
+ transformers==4.51.3
182
+ triton==3.3.0
183
+ trl==0.19.0
184
+ typer==0.16.0
185
+ typing-extensions==4.14.0
186
+ typing-inspection==0.4.1
187
+ tzdata==2025.2
188
+ urllib3==2.4.0
189
+ utils==1.0.2
190
+ uvicorn==0.34.3
191
+ uvloop==0.21.0
192
+ vllm==0.9.0.1
193
+ wandb==0.18.3
194
+ watchfiles==1.0.5
195
+ wcwidth==0.2.13
196
+ websockets==15.0.1
197
+ xformers==0.0.30
198
+ xgrammar==0.1.19
199
+ xxhash==3.5.0
200
+ yarl==1.20.0
201
+ zipp==3.22.0
202
+ tensorboardX==2.6.4
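Since the requirements are fully pinned, a quick sanity check of the active environment can catch version drift before running inference. A minimal sketch (illustrative, not part of the repository) that compares a few of the pins above against the installed packages:

# Compare a handful of the pinned versions above with the active environment.
from importlib.metadata import version, PackageNotFoundError

EXPECTED = {
    "torch": "2.7.0",
    "transformers": "4.51.3",
    "vllm": "0.9.0.1",
    "accelerate": "1.7.0",
}

for package, expected in EXPECTED.items():
    try:
        installed = version(package)
    except PackageNotFoundError:
        print(f"{package}: not installed (expected {expected})")
        continue
    status = "OK" if installed == expected else f"MISMATCH (expected {expected})"
    print(f"{package}: {installed} {status}")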
unified_inference.py ADDED
@@ -0,0 +1,660 @@
1
+ """
2
+ Unified Inference Script for Multi-Modal Image Generation and Editing
3
+
4
+ Supports three modes:
5
+ 1. t2i (Text-to-Image): Generate images from text prompts (txt file)
6
+ 2. geneval: Generate multiple samples per prompt for evaluation (jsonl file)
7
+ 3. edit: Edit images based on prompts (parquet file)
8
+
9
+ Example usage:
10
+ # Text-to-Image
11
+ python unified_inference.py --mode t2i --model_path ./model --model_type flux \
12
+ --prompt_file prompts.txt --output_dir outputs/t2i
13
+
14
+ # GenEval
15
+ python unified_inference.py --mode geneval --model_path ./model --model_type flux \
16
+ --metadata_file evaluation_metadata.jsonl --output_dir outputs/geneval --n_samples 4
17
+
18
+ # Image Editing
19
+ python unified_inference.py --mode edit --model_path ./model --model_type kontext \
20
+ --data_file data.parquet --output_dir outputs/edit
21
+ """
22
+
23
+ from concurrent.futures import ThreadPoolExecutor, as_completed
24
+ import argparse
25
+ import json
26
+ import os
27
+ import traceback
28
+ from tqdm import tqdm
29
+ import torch
30
+ import numpy as np
31
+ from PIL import Image
32
+ from transformers import AutoProcessor
33
+ import random
34
+ import multiprocessing as mp
35
+ import pandas as pd
36
+ from io import BytesIO
37
+ import base64
38
+ from torchvision import transforms as TF
39
+
40
+ # Model imports. Note: only the flux and kontext pipelines are imported here; the "sana" and "sd3" branches in load_model_pipeline below additionally require QwenSanaForInferenceLM / QwenSD3ForInferenceLM, which this file does not import.
41
+ from unimodel.qwenflux.qwenflux_inference import QwenFluxForInferenceLM
42
+ from unimodel.qwenkontext.qwenkontext_inference import QwenKontextForInferenceLM
43
+
44
+ # Global configuration
45
+ NUM_DEVICE = 8
46
+ NUM_PROCESSES = 8
47
+
48
+
49
+ # =============================================================================
50
+ # CoT Prompt Templates
51
+ # =============================================================================
52
+ COT_PROMPT_TEMPLATES = {
53
+ # General enhancement
54
+ "geneval": """Please provide an enhanced prompt for the following image generation prompt to make the image more realistic, detailed, with clear separation and precise alignment of all entities.
55
+ Original prompt: {original_prompt}. Directly provide the improved prompt in <answer> </answer> tags.""",
56
+
57
+
58
+ "ocr_clarity_v2": """Please enhance the following image generation prompt with specific focus on TEXT clarity and readability.
59
+ Original prompt: {original_prompt}. Directly provide the improved prompt in <answer> </answer> tags.""",
60
+
61
+
62
+ "quality_purev2": """Rewrite the following image generation prompt to improve its visual quality, detail level, realism, and artistic sophistication.
63
+
64
+ Original prompt: {original_prompt}
65
+
66
+ Provide the enhanced version directly in <answer></answer> tags.""",
67
+
68
+
69
+ "edit_general": """Please provide an enhanced prompt for the following image editing prompt.
70
+ Ensure the revised prompt is clear, specific, and includes detailed instructions to achieve the desired outcome while maintaining the original intent.
71
+ Original prompt: {original_prompt}. Directly provide the improved prompt in <answer> </answer> tags.""",
72
+
73
+ }
74
+
75
+
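# Illustrative note (not part of the original file): each entry above is a plain
# format string, so enhancing a prompt amounts to a single call such as
#   COT_PROMPT_TEMPLATES["geneval"].format(original_prompt="a photo of a red car")
# The model is then expected to wrap its rewritten prompt in <answer> ... </answer>
# tags, which generate_image_cot presumably extracts as "improved_prompts".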
76
+ # =============================================================================
77
+ # Utility Functions
78
+ # =============================================================================
79
+ def set_global_seed(seed):
80
+ """Set global random seed for reproducibility."""
81
+ random.seed(seed)
82
+ np.random.seed(seed)
83
+ torch.manual_seed(seed)
84
+ torch.cuda.manual_seed(seed)
85
+ torch.cuda.manual_seed_all(seed)
86
+
87
+
88
+
89
+
90
+ # =============================================================================
91
+ # Model Loading
92
+ # =============================================================================
93
+ def load_model_pipeline(model_path, model_type, device):
94
+ """Load model pipeline based on model type."""
95
+ processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
96
+ subfolder = model_path.split('/')[-1]
97
+ model_path = model_path.replace(f"/{subfolder}", "")
98
+ if model_type == "flux":
99
+ model = QwenFluxForInferenceLM.from_pretrained(
100
+ model_path, torch_dtype=torch.bfloat16, subfolder=subfolder
101
+ )
102
+ elif model_type == "sana":
103
+ model = QwenSanaForInferenceLM.from_pretrained(
104
+ model_path, torch_dtype=torch.bfloat16, subfolder=subfolder
105
+ )
106
+ elif model_type == "sd3":
107
+ model = QwenSD3ForInferenceLM.from_pretrained(
108
+ model_path, torch_dtype=torch.bfloat16, subfolder=subfolder
109
+ )
110
+ elif model_type == "kontext":
111
+ model = QwenKontextForInferenceLM.from_pretrained(
112
+ model_path, torch_dtype=torch.bfloat16, subfolder=subfolder
113
+ )
114
+ else:
115
+ raise ValueError(f"Unknown model type: {model_type}")
116
+
117
+ processor.tokenizer.padding_side = "left" # for batch inference
118
+ model.to(device)
119
+
120
+ return model, processor
121
+
122
+
123
+ # =============================================================================
124
+ # Data Loading Functions
125
+ # =============================================================================
126
+ def load_prompts_from_txt(txt_file):
127
+ """Load prompts from text file (one per line)."""
128
+ with open(txt_file, 'r', encoding='utf-8') as f:
129
+ prompts = [line.strip() for line in f if line.strip()]
130
+ return prompts
131
+
132
+
133
+ def load_prompts_from_jsonl(metadata_file):
134
+ """Load prompts and metadata from JSONL file."""
135
+ with open(metadata_file) as fp:
136
+ metadatas = [json.loads(line) for line in fp]
137
+ prompts = [metadata['prompt'].strip() for metadata in metadatas]
138
+ return prompts, metadatas
139
+
140
+
141
+ def load_data_from_parquet(parquet_file):
142
+ """Load images and prompts from parquet file."""
143
+ df = pd.read_parquet(parquet_file)
144
+
145
+ # Identify column names
146
+ image_col = None
147
+ prompt_col = None
148
+ id_col = None
149
+
150
+ for col in df.columns:
151
+ col_lower = col.lower()
152
+ if 'image' in col_lower and image_col is None:
153
+ image_col = col
154
+ elif any(kw in col_lower for kw in ['prompt', 'text', 'caption', 'instruction']) and prompt_col is None:
155
+ prompt_col = col
156
+ elif any(kw in col_lower for kw in ['id', 'index']) and id_col is None:
157
+ id_col = col
158
+
159
+ if image_col is None or prompt_col is None:
160
+ raise ValueError(
161
+ f"Cannot identify columns. Found: {df.columns.tolist()}\n"
162
+ f"Expected 'image' and 'prompt'/'text'/'caption'"
163
+ )
164
+
165
+ print(f"Using columns - Image: '{image_col}', Prompt: '{prompt_col}', ID: '{id_col}'")
166
+
167
+ data_list = []
168
+ for idx, row in tqdm(df.iterrows(), total=len(df), desc="Loading parquet"):
169
+ try:
170
+ image_data = row[image_col]["bytes"]
171
+
172
+ if isinstance(image_data, bytes):
173
+ image = Image.open(BytesIO(image_data)).convert('RGB')
174
+ elif isinstance(image_data, str):
175
+ if image_data.startswith('data:image') or image_data.startswith('/9j/') or image_data.startswith('iVBOR'):
176
+ if 'base64,' in image_data:
177
+ image_data = image_data.split('base64,')[1]
178
+ image_bytes = base64.b64decode(image_data)
179
+ image = Image.open(BytesIO(image_bytes)).convert('RGB')
180
+ else:
181
+ image = Image.open(image_data).convert('RGB')
182
+ else:
183
+ print(f"Warning: Skipping row {idx} - unsupported image format")
184
+ continue
185
+
186
+ prompt = str(row[prompt_col])
187
+ item_id = row[id_col] if id_col else idx
188
+
189
+ data_list.append({
190
+ 'image': image,
191
+ 'prompt': prompt,
192
+ 'id': item_id,
193
+ 'index': idx
194
+ })
195
+ except Exception as e:
196
+ print(f"Error loading row {idx}: {e}")
197
+ continue
198
+
199
+ print(f"Loaded {len(data_list)} samples from parquet")
200
+ return data_list
201
+
202
+
203
+ # =============================================================================
204
+ # Image Grid Utility
205
+ # =============================================================================
206
+ def create_image_grid(images, rows, cols):
207
+ """Create a grid image from a list of images."""
208
+ assert len(images) == rows * cols
209
+ width, height = images[0].size
210
+ grid_width = width * cols
211
+ grid_height = height * rows
212
+ grid_image = Image.new('RGB', (grid_width, grid_height))
213
+ for i, image in enumerate(images):
214
+ x = (i % cols) * width
215
+ y = (i // cols) * height
216
+ grid_image.paste(image, (x, y))
217
+ return grid_image
218
+
219
+
220
+ # =============================================================================
221
+ # Generation Functions
222
+ # =============================================================================
223
+ def generate_t2i_batch(
224
+ prompts, start_idx, pipeline, processor, output_dir, batch_size,
225
+ guidance_scale, num_inference_steps, seed, use_cot, cot_template_name,
226
+ add_instruction, device_id
227
+ ):
228
+ """Generate images from text prompts (T2I mode)."""
229
+ os.makedirs(output_dir, exist_ok=True)
230
+
231
+ for i in tqdm(range(0, len(prompts), batch_size), desc=f"GPU {device_id} T2I"):
232
+ batch_prompts = prompts[i:i + batch_size]
233
+ batch_start_idx = start_idx + i
234
+ original_prompts = batch_prompts.copy()
235
+
236
+ if add_instruction:
237
+ batch_prompts = [
238
+ f"Please generate image based on the following caption: {p}"
239
+ for p in batch_prompts
240
+ ]
241
+
242
+ diffusion_kwargs = dict(
243
+ guidance_scale=guidance_scale,
244
+ num_inference_steps=num_inference_steps,
245
+ num_images_per_prompt=1,
246
+ generator=torch.Generator("cpu").manual_seed(seed)
247
+ )
248
+
249
+ try:
250
+ with torch.no_grad():
251
+ if use_cot:
252
+ llm_kwargs = dict(
253
+ max_new_tokens=256, temperature=0.7, top_p=0.9,
254
+ do_sample=False, num_return_sequences=1
255
+ )
256
+ cot_template = COT_PROMPT_TEMPLATES.get(cot_template_name)
257
+ outputs = pipeline.generate_image_cot(
258
+ texts=batch_prompts,
259
+ diffusion_kwargs=diffusion_kwargs,
260
+ processor=processor,
261
+ llm_kwargs=llm_kwargs,
262
+ cot_prompt_template=cot_template
263
+ )
264
+ images = outputs["images"]
265
+ thinking_prompts = outputs.get("improved_prompts", [])
266
+ else:
267
+ images = pipeline.generate_image(
268
+ texts=batch_prompts,
269
+ diffusion_kwargs=diffusion_kwargs
270
+ )
271
+ thinking_prompts = []
272
+
273
+ for j, img in enumerate(images):
274
+ img_idx = batch_start_idx + j
275
+ base_name = f"{img_idx:05d}"
276
+
277
+ img.save(os.path.join(output_dir, f"{base_name}.png"))
278
+
279
+ with open(os.path.join(output_dir, f"{base_name}_caption.txt"), 'w', encoding='utf-8') as f:
280
+ f.write(original_prompts[j])
281
+
282
+ if use_cot and j < len(thinking_prompts):
283
+ with open(os.path.join(output_dir, f"{base_name}_thinking.txt"), 'w', encoding='utf-8') as f:
284
+ f.write(thinking_prompts[j])
285
+
286
+ except Exception as e:
287
+ print(f"Error at batch {batch_start_idx}: {e}")
288
+ traceback.print_exc()
289
+
290
+
291
+ def generate_geneval_batch(
292
+ prompts, metadatas, start_idx, pipeline, processor, output_dir, batch_size,
293
+ guidance_scale, num_inference_steps, seed, n_samples, use_cot,
294
+ cot_template_name, skip_grid, device_id
295
+ ):
296
+ """Generate multiple samples per prompt for evaluation (GenEval mode)."""
297
+ for prompt_idx, (prompt, metadata) in enumerate(zip(prompts, metadatas)):
298
+ global_idx = start_idx + prompt_idx
299
+ outpath = os.path.join(output_dir, f"{device_id}_{prompt_idx:0>5}")
300
+ os.makedirs(outpath, exist_ok=True)
301
+ sample_path = os.path.join(outpath, "samples")
302
+ os.makedirs(sample_path, exist_ok=True)
303
+
304
+ with open(os.path.join(outpath, "metadata.jsonl"), "w") as fp:
305
+ json.dump(metadata, fp)
306
+
307
+ sample_count = 0
308
+ all_samples = []
309
+ enhanced_prompts = []
310
+ total_batches = (n_samples + batch_size - 1) // batch_size
311
+
312
+ for batch_idx in tqdm(range(total_batches), desc=f"GPU {device_id} prompt {prompt_idx}"):
313
+ num_images = min(batch_size, n_samples - sample_count)
314
+
315
+ diffusion_kwargs = dict(
316
+ guidance_scale=guidance_scale,
317
+ num_inference_steps=num_inference_steps,
318
+ num_images_per_prompt=num_images,
319
+ generator=torch.Generator("cpu").manual_seed(seed)
320
+ )
321
+
322
+ try:
323
+ with torch.inference_mode():
324
+ if use_cot:
325
+ llm_kwargs = dict(
326
+ max_new_tokens=256, temperature=0.7, top_p=0.9,
327
+ do_sample=False, num_return_sequences=1
328
+ )
329
+ cot_template = COT_PROMPT_TEMPLATES.get(cot_template_name)
330
+ outputs = pipeline.generate_image_cot(
331
+ texts=prompt,
332
+ diffusion_kwargs=diffusion_kwargs,
333
+ processor=processor,
334
+ llm_kwargs=llm_kwargs,
335
+ cot_prompt_template=cot_template
336
+ )
337
+ images = outputs["images"]
338
+ enhanced_prompts.extend(outputs.get("improved_prompts", []))
339
+ else:
340
+ images = pipeline.generate_image(
341
+ texts=prompt,
342
+ diffusion_kwargs=diffusion_kwargs
343
+ )
344
+
345
+ for img in images:
346
+ img.save(os.path.join(sample_path, f"{sample_count:05}.png"))
347
+ sample_count += 1
348
+ if not skip_grid:
349
+ all_samples.append(img)
350
+
351
+ except Exception as e:
352
+ print(f"Error at prompt {prompt_idx}, batch {batch_idx}: {e}")
353
+ traceback.print_exc()
354
+
355
+ # Save enhanced prompts
356
+ with open(os.path.join(outpath, "thinking_prompts.txt"), "w") as fp:
357
+ for ep in enhanced_prompts:
358
+ fp.write(f"{ep}\n")
359
+
360
+ # Create grid
361
+ if not skip_grid and all_samples:
362
+ rows = int(np.sqrt(n_samples))
363
+ cols = (n_samples + rows - 1) // rows
364
+ if rows * cols >= len(all_samples):
365
+ grid_image = create_image_grid(all_samples[:rows * cols], rows, cols)
366
+ grid_image.save(os.path.join(outpath, "grid.jpg"))
367
+
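As a quick sanity check of the GenEval layout above: with the default n_samples=4 the grid is 2x2 (rows = int(sqrt(4)) = 2). A hedged sketch of the resulting tree for device 0 and its first prompt (the output root is a placeholder):

# <output_dir>/0_00000/metadata.jsonl         # this prompt's metadata record
# <output_dir>/0_00000/samples/00000.png      # n_samples independent samples
# <output_dir>/0_00000/samples/00001.png
# <output_dir>/0_00000/samples/00002.png
# <output_dir>/0_00000/samples/00003.png
# <output_dir>/0_00000/thinking_prompts.txt   # CoT prompts (empty unless --use_cot)
# <output_dir>/0_00000/grid.jpg               # 2x2 grid, omitted when --skip_grid is passed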
368
+
369
+ def generate_edit_batch(
370
+ data_batch, start_idx, pipeline, processor, output_dir, batch_size,
371
+ guidance_scale, num_inference_steps, seed, use_cot, cot_template_name,
372
+ device_id, resolution
373
+ ):
374
+ """Edit images based on prompts (Edit mode)."""
375
+ os.makedirs(output_dir, exist_ok=True)
376
+
377
+ transform = TF.Compose([
378
+ TF.Resize(resolution),
379
+ TF.CenterCrop(resolution)
380
+ ])
381
+
382
+ for i in tqdm(range(0, len(data_batch), batch_size), desc=f"GPU {device_id} Edit"):
383
+ batch_data = data_batch[i:i + batch_size]
384
+ batch_start_idx = start_idx + i
385
+
386
+ batch_images = [transform(item['image']) for item in batch_data]
387
+ batch_prompts = [item['prompt'] for item in batch_data]
388
+ batch_ids = [item['id'] for item in batch_data]
389
+
390
+ diffusion_kwargs = dict(
391
+ guidance_scale=guidance_scale,
392
+ num_inference_steps=num_inference_steps,
393
+ num_images_per_prompt=1,
394
+ generator=torch.Generator("cpu").manual_seed(seed),
395
+ max_area=resolution ** 2
396
+ )
397
+
398
+ try:
399
+ with torch.no_grad():
400
+ if use_cot:
401
+ llm_kwargs = dict(
402
+ max_new_tokens=256, temperature=0.7, top_p=0.9,
403
+ do_sample=False, num_return_sequences=1
404
+ )
405
+ cot_template = COT_PROMPT_TEMPLATES.get(cot_template_name)
406
+ outputs = pipeline.generate_image_cot(
407
+ images=batch_images,
408
+ texts=batch_prompts,
409
+ diffusion_kwargs=diffusion_kwargs,
410
+ processor=processor,
411
+ llm_kwargs=llm_kwargs,
412
+ cot_prompt_template=cot_template
413
+ )
414
+ edited_images = outputs["images"]
415
+ improved_prompts = outputs.get("improved_prompts", [])
416
+ else:
417
+ edited_images = pipeline.generate_image(
418
+ images=batch_images,
419
+ texts=batch_prompts,
420
+ diffusion_kwargs=diffusion_kwargs
421
+ )
422
+ improved_prompts = []
423
+
424
+ for j, (edited_img, ref_img) in enumerate(zip(edited_images, batch_images)):
425
+ item_id = batch_ids[j]
426
+ base_name = f"{item_id}"
427
+
428
+ edited_img.save(os.path.join(output_dir, f"{base_name}_edited.png"))
429
+ ref_img.save(os.path.join(output_dir, f"{base_name}_reference.png"))
430
+
431
+ with open(os.path.join(output_dir, f"{base_name}_prompt.txt"), 'w', encoding='utf-8') as f:
432
+ f.write(batch_prompts[j])
433
+
434
+ if use_cot and j < len(improved_prompts):
435
+ with open(os.path.join(output_dir, f"{base_name}_improved_prompt.txt"), 'w', encoding='utf-8') as f:
436
+ f.write(improved_prompts[j])
437
+
438
+ except Exception as e:
439
+ print(f"Error at batch {batch_start_idx}: {e}")
440
+ traceback.print_exc()
441
+
442
+
443
+ # =============================================================================
444
+ # Worker Process
445
+ # =============================================================================
446
+ def worker_process(
447
+ device_id, mode, data, start_idx, pipeline, processor, output_dir,
448
+ batch_size, guidance_scale, num_inference_steps, seed, use_cot,
449
+ cot_template_name, add_instruction, n_samples, skip_grid, resolution, metadatas=None
450
+ ):
451
+ """Single GPU worker process."""
452
+ torch.cuda.set_device(f"cuda:{device_id % NUM_DEVICE}")
453
+
454
+ print(f"GPU {device_id}: Processing {len(data)} items (indices {start_idx} to {start_idx + len(data) - 1})")
455
+
456
+ if mode == "t2i":
457
+ generate_t2i_batch(
458
+ prompts=data, start_idx=start_idx, pipeline=pipeline,
459
+ processor=processor, output_dir=output_dir, batch_size=batch_size,
460
+ guidance_scale=guidance_scale, num_inference_steps=num_inference_steps,
461
+ seed=seed, use_cot=use_cot, cot_template_name=cot_template_name,
462
+ add_instruction=add_instruction, device_id=device_id
463
+ )
464
+ elif mode == "geneval":
465
+ generate_geneval_batch(
466
+ prompts=data, metadatas=metadatas, start_idx=start_idx,
467
+ pipeline=pipeline, processor=processor, output_dir=output_dir,
468
+ batch_size=batch_size, guidance_scale=guidance_scale,
469
+ num_inference_steps=num_inference_steps, seed=seed,
470
+ n_samples=n_samples, use_cot=use_cot, cot_template_name=cot_template_name,
471
+ skip_grid=skip_grid, device_id=device_id
472
+ )
473
+ elif mode == "edit":
474
+ generate_edit_batch(
475
+ data_batch=data, start_idx=start_idx, pipeline=pipeline,
476
+ processor=processor, output_dir=output_dir, batch_size=batch_size,
477
+ guidance_scale=guidance_scale, num_inference_steps=num_inference_steps,
478
+ seed=seed, use_cot=use_cot, cot_template_name=cot_template_name,
479
+ device_id=device_id, resolution=resolution
480
+ )
481
+
482
+ print(f"GPU {device_id}: Completed!")
483
+
484
+
485
+ # =============================================================================
486
+ # Argument Parser
487
+ # =============================================================================
488
+ def parse_args():
489
+ parser = argparse.ArgumentParser(
490
+ description="Unified Inference Script for Image Generation and Editing"
491
+ )
492
+
493
+ # Mode selection
494
+ parser.add_argument(
495
+ "--mode", type=str, required=True,
496
+ choices=["t2i", "geneval", "edit"],
497
+ help="Inference mode: t2i (text-to-image), geneval (evaluation), edit (image editing)"
498
+ )
499
+
500
+ # Input/Output
501
+ parser.add_argument("--prompt_file", type=str, help="Text file with prompts (for t2i mode)")
502
+ parser.add_argument("--metadata_file", type=str, help="JSONL metadata file (for geneval mode)")
503
+ parser.add_argument("--data_file", type=str, help="Parquet file with images and prompts (for edit mode)")
504
+ parser.add_argument("--output_dir", type=str, default="outputs", help="Output directory")
505
+
506
+ # Model configuration
507
+ parser.add_argument("--model_path", type=str, required=True, help="Model path")
508
+ parser.add_argument(
509
+ "--model_type", type=str, default="flux",
510
+ choices=["flux", "sana", "sd3", "kontext"],
511
+ help="Model type"
512
+ )
513
+
514
+ # Generation parameters
515
+ parser.add_argument("--batch_size", type=int, default=8, help="Batch size")
516
+ parser.add_argument("--resolution", type=int, default=1024, help="Image resolution")
517
+ parser.add_argument("--guidance_scale", type=float, default=3.5, help="CFG guidance scale")
518
+ parser.add_argument("--num_inference_steps", type=int, default=40, help="Inference steps")
519
+ parser.add_argument("--seed", type=int, default=42, help="Random seed")
520
+
521
+ # CoT options
522
+ parser.add_argument("--use_cot", action="store_true", help="Use Chain of Thought")
523
+ parser.add_argument(
524
+ "--cot_template", type=str, default="general",
525
+ choices=list(COT_PROMPT_TEMPLATES.keys()),
526
+ help="CoT prompt template"
527
+ )
528
+ parser.add_argument("--add_instruction", action="store_true", help="Add instruction prefix (t2i mode)")
529
+
530
+ # GenEval specific
531
+ parser.add_argument("--n_samples", type=int, default=4, help="Samples per prompt (geneval mode)")
532
+ parser.add_argument("--skip_grid", action="store_true", help="Skip grid image (geneval mode)")
533
+
534
+ # Hardware
535
+ parser.add_argument("--num_gpus", type=int, default=None, help="Number of GPUs to use")
536
+ parser.add_argument("--max_samples", type=int, default=None, help="Max samples to process")
537
+
538
+ return parser.parse_args()
539
+
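For reference, a hedged sketch of how this CLI could be invoked for each mode. The script filename ("inference.py") and all paths below are placeholders, not names taken from this repository:

import subprocess

# Text-to-image over a plain prompt list, with Chain-of-Thought prompt rewriting.
subprocess.run([
    "python", "inference.py", "--mode", "t2i",
    "--model_path", "/path/to/checkpoint", "--model_type", "flux",
    "--prompt_file", "prompts.txt", "--output_dir", "outputs/t2i",
    "--use_cot", "--cot_template", "general",
], check=True)

# GenEval-style evaluation: four samples per metadata record, no grid image.
subprocess.run([
    "python", "inference.py", "--mode", "geneval",
    "--model_path", "/path/to/checkpoint",
    "--metadata_file", "evaluation_metadata.jsonl", "--output_dir", "outputs/geneval",
    "--n_samples", "4", "--skip_grid",
], check=True)

# Image editing from a parquet file; edit mode expects the kontext model type.
subprocess.run([
    "python", "inference.py", "--mode", "edit",
    "--model_path", "/path/to/kontext-checkpoint", "--model_type", "kontext",
    "--data_file", "edit_data.parquet", "--output_dir", "outputs/edit",
], check=True)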
540
+
541
+ # =============================================================================
542
+ # Main Function
543
+ # =============================================================================
544
+ def main():
545
+ mp.set_start_method('spawn', force=True)
546
+ args = parse_args()
547
+
548
+ global NUM_PROCESSES
549
+ if args.num_gpus is not None:
550
+ NUM_PROCESSES = min(args.num_gpus, NUM_DEVICE)
551
+
552
+ # Validate mode-specific arguments
553
+ if args.mode == "t2i" and not args.prompt_file:
554
+ raise ValueError("--prompt_file is required for t2i mode")
555
+ if args.mode == "geneval" and not args.metadata_file:
556
+ raise ValueError("--metadata_file is required for geneval mode")
557
+ if args.mode == "edit" and not args.data_file:
558
+ raise ValueError("--data_file is required for edit mode")
559
+ if args.mode == "edit" and args.model_type != "kontext":
560
+ print(f"Warning: edit mode typically uses kontext model, but got {args.model_type}")
561
+
562
+ # Load data based on mode
563
+ print(f"Mode: {args.mode}")
564
+ metadatas = None
565
+
566
+ if args.mode == "t2i":
567
+ print(f"Loading prompts from {args.prompt_file}...")
568
+ data = load_prompts_from_txt(args.prompt_file)
569
+ elif args.mode == "geneval":
570
+ print(f"Loading metadata from {args.metadata_file}...")
571
+ data, metadatas = load_prompts_from_jsonl(args.metadata_file)
572
+ elif args.mode == "edit":
573
+ print(f"Loading data from {args.data_file}...")
574
+ data = load_data_from_parquet(args.data_file)
575
+
576
+ # Apply max_samples limit
577
+ if args.max_samples is not None:
578
+ if args.mode == "geneval":
579
+ data = data[:args.max_samples]
580
+ metadatas = metadatas[:args.max_samples]
581
+ else:
582
+ data = data[:args.max_samples]
583
+ print(f"Limited to {len(data)} samples")
584
+
585
+ print(f"Total samples: {len(data)}")
586
+
587
+ # Create output directory
588
+ os.makedirs(args.output_dir, exist_ok=True)
589
+
590
+ # Save configuration
591
+ config_path = os.path.join(args.output_dir, "config.json")
592
+ config_dict = vars(args).copy()
593
+ with open(config_path, 'w') as f:
594
+ json.dump(config_dict, f, indent=2)
595
+ print(f"Config saved to {config_path}")
596
+
597
+ # Load models
598
+ print("Loading models...")
599
+ pipelines = []
600
+ processors = []
601
+
602
+ for i in range(NUM_DEVICE):
603
+ print(f"Loading model {i+1}/{NUM_DEVICE} on cuda:{i % NUM_DEVICE}...")
604
+ pipeline, processor = load_model_pipeline(
605
+ args.model_path, args.model_type, f"cuda:{i % NUM_DEVICE}"
606
+ )
607
+ pipelines.append(pipeline)
608
+ processors.append(processor)
609
+
610
+ print("All models loaded!")
611
+
612
+ # Distribute data across GPUs
613
+ samples_per_gpu = len(data) // NUM_PROCESSES
614
+
615
+ with ThreadPoolExecutor(max_workers=NUM_PROCESSES) as executor:
616
+ futures = []
617
+
618
+ for device_id in range(NUM_PROCESSES):
619
+ start_idx = device_id * samples_per_gpu
620
+ end_idx = len(data) if device_id == NUM_PROCESSES - 1 else start_idx + samples_per_gpu
621
+
622
+ gpu_data = data[start_idx:end_idx]
623
+ gpu_metadatas = metadatas[start_idx:end_idx] if metadatas else None
624
+
625
+ future = executor.submit(
626
+ worker_process,
627
+ device_id=device_id,
628
+ mode=args.mode,
629
+ data=gpu_data,
630
+ start_idx=start_idx,
631
+ pipeline=pipelines[device_id % NUM_DEVICE],
632
+ processor=processors[device_id % NUM_DEVICE],
633
+ output_dir=args.output_dir,
634
+ batch_size=args.batch_size,
635
+ guidance_scale=args.guidance_scale,
636
+ num_inference_steps=args.num_inference_steps,
637
+ seed=args.seed,
638
+ use_cot=args.use_cot,
639
+ cot_template_name=args.cot_template,
640
+ add_instruction=args.add_instruction,
641
+ n_samples=args.n_samples,
642
+ skip_grid=args.skip_grid,
643
+ resolution=args.resolution,
644
+ metadatas=gpu_metadatas
645
+ )
646
+ futures.append(future)
647
+
648
+ for future in as_completed(futures):
649
+ try:
650
+ future.result()
651
+ except Exception as e:
652
+ print(f"Worker failed: {e}")
653
+ traceback.print_exc()
654
+
655
+ print(f"\n✓ Done! Results saved to {args.output_dir}")
656
+ print(f" Total processed: {len(data)}")
657
+
658
+
659
+ if __name__ == "__main__":
660
+ main()
unimodel/qwenflux/fluxpipeline.py ADDED
@@ -0,0 +1,1543 @@
1
+ # Copyright 2024 Black Forest Labs and The HuggingFace Team. All rights reserved.
2
+ # Copyright 2025 Fu-Yun Wang
3
+
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import inspect
18
+ from typing import Any, Callable, Dict, List, Optional, Union, Tuple
19
+
20
+ import numpy as np
21
+ import torch
22
+ from transformers import (
23
+ CLIPImageProcessor,
24
+ CLIPTextModel,
25
+ CLIPTokenizer,
26
+ CLIPVisionModelWithProjection,
27
+ T5EncoderModel,
28
+ T5TokenizerFast,
29
+ )
30
+
31
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
32
+ from diffusers.loaders import FluxIPAdapterMixin, FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin
33
+ from diffusers.models import AutoencoderKL, FluxTransformer2DModel
34
+ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
35
+ from diffusers.schedulers.scheduling_flow_match_euler_discrete import FlowMatchEulerDiscreteSchedulerOutput
36
+ from diffusers.utils import (
37
+ USE_PEFT_BACKEND,
38
+ is_torch_xla_available,
39
+ logging,
40
+ replace_example_docstring,
41
+ scale_lora_layers,
42
+ unscale_lora_layers,
43
+ )
44
+ from diffusers.utils.torch_utils import randn_tensor
45
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
46
+ from diffusers.pipelines.flux.pipeline_output import FluxPipelineOutput
47
+ import math
48
+
49
+
50
+ if is_torch_xla_available():
51
+ import torch_xla.core.xla_model as xm
52
+
53
+ XLA_AVAILABLE = True
54
+ else:
55
+ XLA_AVAILABLE = False
56
+
57
+
58
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
59
+
60
+ EXAMPLE_DOC_STRING = """
61
+ Examples:
62
+ ```py
63
+ >>> import torch
64
+ >>> from diffusers import FluxPipeline
65
+
66
+ >>> pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
67
+ >>> pipe.to("cuda")
68
+ >>> prompt = "A cat holding a sign that says hello world"
69
+ >>> # Depending on the variant being used, the pipeline call will slightly vary.
70
+ >>> # Refer to the pipeline documentation for more details.
71
+ >>> image = pipe(prompt, num_inference_steps=4, guidance_scale=0.0).images[0]
72
+ >>> image.save("flux.png")
73
+ ```
74
+ """
75
+
76
+
77
+ def calculate_shift(
78
+ image_seq_len,
79
+ base_seq_len: int = 256,
80
+ max_seq_len: int = 4096,
81
+ base_shift: float = 0.5,
82
+ max_shift: float = 1.15,
83
+ ):
84
+ m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
85
+ b = base_shift - m * base_seq_len
86
+ mu = image_seq_len * m + b
87
+ return mu
88
+
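A small worked example of the linear shift above, plugging in the defaults base_seq_len=256, max_seq_len=4096, base_shift=0.5 and max_shift=1.15 (the same values __call__ reads from the scheduler config further below):

# For a 1024x1024 image the packed latent sequence length is (1024 / 16) ** 2 = 4096,
# so mu reaches the maximum shift:
#   m  = (1.15 - 0.5) / (4096 - 256) ~= 1.693e-4
#   b  = 0.5 - m * 256               ~= 0.4567
#   mu = 4096 * m + b                 = 1.15
# For a 512x512 image the sequence length is (512 / 16) ** 2 = 1024 and mu ~= 0.63.
print(calculate_shift(4096))  # -> 1.15
print(calculate_shift(1024))  # -> ~0.63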
89
+
90
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
91
+ def retrieve_timesteps(
92
+ scheduler,
93
+ num_inference_steps: Optional[int] = None,
94
+ device: Optional[Union[str, torch.device]] = None,
95
+ timesteps: Optional[List[int]] = None,
96
+ sigmas: Optional[List[float]] = None,
97
+ **kwargs,
98
+ ):
99
+ r"""
100
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
101
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
102
+
103
+ Args:
104
+ scheduler (`SchedulerMixin`):
105
+ The scheduler to get timesteps from.
106
+ num_inference_steps (`int`):
107
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
108
+ must be `None`.
109
+ device (`str` or `torch.device`, *optional*):
110
+ The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
111
+ timesteps (`List[int]`, *optional*):
112
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
113
+ `num_inference_steps` and `sigmas` must be `None`.
114
+ sigmas (`List[float]`, *optional*):
115
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
116
+ `num_inference_steps` and `timesteps` must be `None`.
117
+
118
+ Returns:
119
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
120
+ second element is the number of inference steps.
121
+ """
122
+ if timesteps is not None and sigmas is not None:
123
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
124
+ if timesteps is not None:
125
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
126
+ if not accepts_timesteps:
127
+ raise ValueError(
128
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
129
+ f" timestep schedules. Please check whether you are using the correct scheduler."
130
+ )
131
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
132
+ timesteps = scheduler.timesteps
133
+ num_inference_steps = len(timesteps)
134
+ elif sigmas is not None:
135
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
136
+ if not accept_sigmas:
137
+ raise ValueError(
138
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
139
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
140
+ )
141
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
142
+ timesteps = scheduler.timesteps
143
+ num_inference_steps = len(timesteps)
144
+ else:
145
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
146
+ timesteps = scheduler.timesteps
147
+ return timesteps, num_inference_steps
148
+
149
+
150
+ class FluxPipeline(
151
+ DiffusionPipeline,
152
+ FluxLoraLoaderMixin,
153
+ FromSingleFileMixin,
154
+ TextualInversionLoaderMixin,
155
+ FluxIPAdapterMixin,
156
+ ):
157
+ r"""
158
+ The Flux pipeline for text-to-image generation.
159
+
160
+ Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
161
+
162
+ Args:
163
+ transformer ([`FluxTransformer2DModel`]):
164
+ Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
165
+ scheduler ([`FlowMatchEulerDiscreteScheduler`]):
166
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
167
+ vae ([`AutoencoderKL`]):
168
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
169
+ text_encoder ([`CLIPTextModel`]):
170
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
171
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
172
+ text_encoder_2 ([`T5EncoderModel`]):
173
+ [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
174
+ the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
175
+ tokenizer (`CLIPTokenizer`):
176
+ Tokenizer of class
177
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
178
+ tokenizer_2 (`T5TokenizerFast`):
179
+ Second Tokenizer of class
180
+ [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
181
+ """
182
+
183
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->transformer->vae"
184
+ _optional_components = ["image_encoder", "feature_extractor"]
185
+ _callback_tensor_inputs = ["latents", "prompt_embeds"]
186
+
187
+ def __init__(
188
+ self,
189
+ scheduler: FlowMatchEulerDiscreteScheduler,
190
+ vae: AutoencoderKL,
191
+ text_encoder: CLIPTextModel,
192
+ tokenizer: CLIPTokenizer,
193
+ text_encoder_2: T5EncoderModel,
194
+ tokenizer_2: T5TokenizerFast,
195
+ transformer: FluxTransformer2DModel,
196
+ image_encoder: CLIPVisionModelWithProjection = None,
197
+ feature_extractor: CLIPImageProcessor = None,
198
+ ):
199
+ super().__init__()
200
+
201
+ self.register_modules(
202
+ vae=vae,
203
+ text_encoder=text_encoder,
204
+ text_encoder_2=text_encoder_2,
205
+ tokenizer=tokenizer,
206
+ tokenizer_2=tokenizer_2,
207
+ transformer=transformer,
208
+ scheduler=scheduler,
209
+ image_encoder=image_encoder,
210
+ feature_extractor=feature_extractor,
211
+ )
212
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
213
+ # Flux latents are turned into 2x2 patches and packed. This means the latent width and height have to be divisible
214
+ # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
215
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
216
+ self.tokenizer_max_length = (
217
+ self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
218
+ )
219
+ self.default_sample_size = 128
220
+
221
+ def _get_t5_prompt_embeds(
222
+ self,
223
+ prompt: Union[str, List[str]] = None,
224
+ num_images_per_prompt: int = 1,
225
+ max_sequence_length: int = 512,
226
+ device: Optional[torch.device] = None,
227
+ dtype: Optional[torch.dtype] = None,
228
+ ):
229
+ device = device or self._execution_device
230
+ dtype = dtype or self.text_encoder.dtype
231
+
232
+ prompt = [prompt] if isinstance(prompt, str) else prompt
233
+ batch_size = len(prompt)
234
+
235
+ if isinstance(self, TextualInversionLoaderMixin):
236
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer_2)
237
+
238
+ text_inputs = self.tokenizer_2(
239
+ prompt,
240
+ padding="max_length",
241
+ max_length=max_sequence_length,
242
+ truncation=True,
243
+ return_length=False,
244
+ return_overflowing_tokens=False,
245
+ return_tensors="pt",
246
+ )
247
+ text_input_ids = text_inputs.input_ids
248
+ untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids
249
+
250
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
251
+ removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
252
+ logger.warning(
253
+ "The following part of your input was truncated because `max_sequence_length` is set to "
254
+ f" {max_sequence_length} tokens: {removed_text}"
255
+ )
256
+
257
+ prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]
258
+
259
+ dtype = self.text_encoder_2.dtype
260
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
261
+
262
+ _, seq_len, _ = prompt_embeds.shape
263
+
264
+ # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
265
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
266
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
267
+
268
+ return prompt_embeds
269
+
270
+ def _get_clip_prompt_embeds(
271
+ self,
272
+ prompt: Union[str, List[str]],
273
+ num_images_per_prompt: int = 1,
274
+ device: Optional[torch.device] = None,
275
+ ):
276
+ device = device or self._execution_device
277
+
278
+ prompt = [prompt] if isinstance(prompt, str) else prompt
279
+ batch_size = len(prompt)
280
+
281
+ if isinstance(self, TextualInversionLoaderMixin):
282
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
283
+
284
+ text_inputs = self.tokenizer(
285
+ prompt,
286
+ padding="max_length",
287
+ max_length=self.tokenizer_max_length,
288
+ truncation=True,
289
+ return_overflowing_tokens=False,
290
+ return_length=False,
291
+ return_tensors="pt",
292
+ )
293
+
294
+ text_input_ids = text_inputs.input_ids
295
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
296
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
297
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
298
+ logger.warning(
299
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
300
+ f" {self.tokenizer_max_length} tokens: {removed_text}"
301
+ )
302
+ prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False)
303
+
304
+ # Use pooled output of CLIPTextModel
305
+ prompt_embeds = prompt_embeds.pooler_output
306
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
307
+
308
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
309
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
310
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
311
+
312
+ return prompt_embeds
313
+
314
+ def encode_prompt(
315
+ self,
316
+ prompt: Union[str, List[str]],
317
+ prompt_2: Union[str, List[str]],
318
+ device: Optional[torch.device] = None,
319
+ num_images_per_prompt: int = 1,
320
+ prompt_embeds: Optional[torch.FloatTensor] = None,
321
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
322
+ max_sequence_length: int = 512,
323
+ lora_scale: Optional[float] = None,
324
+ ):
325
+ r"""
326
+
327
+ Args:
328
+ prompt (`str` or `List[str]`, *optional*):
329
+ prompt to be encoded
330
+ prompt_2 (`str` or `List[str]`, *optional*):
331
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
332
+ used in all text-encoders
333
+ device: (`torch.device`):
334
+ torch device
335
+ num_images_per_prompt (`int`):
336
+ number of images that should be generated per prompt
337
+ prompt_embeds (`torch.FloatTensor`, *optional*):
338
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
339
+ provided, text embeddings will be generated from `prompt` input argument.
340
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
341
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
342
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
343
+ lora_scale (`float`, *optional*):
344
+ A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
345
+ """
346
+ device = device or self._execution_device
347
+
348
+ # set lora scale so that monkey patched LoRA
349
+ # function of text encoder can correctly access it
350
+ if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
351
+ self._lora_scale = lora_scale
352
+
353
+ # dynamically adjust the LoRA scale
354
+ if self.text_encoder is not None and USE_PEFT_BACKEND:
355
+ scale_lora_layers(self.text_encoder, lora_scale)
356
+ if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
357
+ scale_lora_layers(self.text_encoder_2, lora_scale)
358
+
359
+ prompt = [prompt] if isinstance(prompt, str) else prompt
360
+
361
+ if prompt_embeds is None:
362
+ prompt_2 = prompt_2 or prompt
363
+ prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
364
+
365
+ # We only use the pooled prompt output from the CLIPTextModel
366
+ pooled_prompt_embeds = self._get_clip_prompt_embeds(
367
+ prompt=prompt,
368
+ device=device,
369
+ num_images_per_prompt=num_images_per_prompt,
370
+ )
371
+ prompt_embeds = self._get_t5_prompt_embeds(
372
+ prompt=prompt_2,
373
+ num_images_per_prompt=num_images_per_prompt,
374
+ max_sequence_length=max_sequence_length,
375
+ device=device,
376
+ )
377
+
378
+ if self.text_encoder is not None:
379
+ if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
380
+ # Retrieve the original scale by scaling back the LoRA layers
381
+ unscale_lora_layers(self.text_encoder, lora_scale)
382
+
383
+ if self.text_encoder_2 is not None:
384
+ if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
385
+ # Retrieve the original scale by scaling back the LoRA layers
386
+ unscale_lora_layers(self.text_encoder_2, lora_scale)
387
+
388
+ dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
389
+ text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
390
+
391
+ return prompt_embeds, pooled_prompt_embeds, text_ids
392
+
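For orientation, a hedged note on the tensor shapes this method returns for the standard FLUX text encoders (CLIP-L pooled width 768, T5-XXL hidden size 4096; these sizes come from the usual FLUX checkpoints, not from anything stated in this file):

# prompt_embeds, pooled_prompt_embeds, text_ids = pipe.encode_prompt(
#     prompt="a photo of a corgi", prompt_2=None, num_images_per_prompt=2,
# )
# prompt_embeds.shape        -> (2, 512, 4096)  # T5 sequence embeddings, padded to max_sequence_length
# pooled_prompt_embeds.shape -> (2, 768)        # pooled CLIP embedding
# text_ids.shape             -> (512, 3)        # all-zero positional ids for the text tokens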
393
+ def encode_image(self, image, device, num_images_per_prompt):
394
+ dtype = next(self.image_encoder.parameters()).dtype
395
+
396
+ if not isinstance(image, torch.Tensor):
397
+ image = self.feature_extractor(image, return_tensors="pt").pixel_values
398
+
399
+ image = image.to(device=device, dtype=dtype)
400
+ image_embeds = self.image_encoder(image).image_embeds
401
+ image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
402
+ return image_embeds
403
+
404
+ def prepare_ip_adapter_image_embeds(
405
+ self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt
406
+ ):
407
+ image_embeds = []
408
+ if ip_adapter_image_embeds is None:
409
+ if not isinstance(ip_adapter_image, list):
410
+ ip_adapter_image = [ip_adapter_image]
411
+
412
+ if len(ip_adapter_image) != self.transformer.encoder_hid_proj.num_ip_adapters:
413
+ raise ValueError(
414
+ f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {self.transformer.encoder_hid_proj.num_ip_adapters} IP Adapters."
415
+ )
416
+
417
+ for single_ip_adapter_image in ip_adapter_image:
418
+ single_image_embeds = self.encode_image(single_ip_adapter_image, device, 1)
419
+ image_embeds.append(single_image_embeds[None, :])
420
+ else:
421
+ if not isinstance(ip_adapter_image_embeds, list):
422
+ ip_adapter_image_embeds = [ip_adapter_image_embeds]
423
+
424
+ if len(ip_adapter_image_embeds) != self.transformer.encoder_hid_proj.num_ip_adapters:
425
+ raise ValueError(
426
+ f"`ip_adapter_image_embeds` must have same length as the number of IP Adapters. Got {len(ip_adapter_image_embeds)} image embeds and {self.transformer.encoder_hid_proj.num_ip_adapters} IP Adapters."
427
+ )
428
+
429
+ for single_image_embeds in ip_adapter_image_embeds:
430
+ image_embeds.append(single_image_embeds)
431
+
432
+ ip_adapter_image_embeds = []
433
+ for single_image_embeds in image_embeds:
434
+ single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
435
+ single_image_embeds = single_image_embeds.to(device=device)
436
+ ip_adapter_image_embeds.append(single_image_embeds)
437
+
438
+ return ip_adapter_image_embeds
439
+
440
+ def check_inputs(
441
+ self,
442
+ prompt,
443
+ prompt_2,
444
+ height,
445
+ width,
446
+ negative_prompt=None,
447
+ negative_prompt_2=None,
448
+ prompt_embeds=None,
449
+ negative_prompt_embeds=None,
450
+ pooled_prompt_embeds=None,
451
+ negative_pooled_prompt_embeds=None,
452
+ callback_on_step_end_tensor_inputs=None,
453
+ max_sequence_length=None,
454
+ ):
455
+ if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
456
+ logger.warning(
457
+ f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
458
+ )
459
+
460
+ if callback_on_step_end_tensor_inputs is not None and not all(
461
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
462
+ ):
463
+ raise ValueError(
464
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
465
+ )
466
+
467
+ if prompt is not None and prompt_embeds is not None:
468
+ raise ValueError(
469
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
470
+ " only forward one of the two."
471
+ )
472
+ elif prompt_2 is not None and prompt_embeds is not None:
473
+ raise ValueError(
474
+ f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
475
+ " only forward one of the two."
476
+ )
477
+ elif prompt is None and prompt_embeds is None:
478
+ raise ValueError(
479
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
480
+ )
481
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
482
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
483
+ elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
484
+ raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
485
+
486
+ if negative_prompt is not None and negative_prompt_embeds is not None:
487
+ raise ValueError(
488
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
489
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
490
+ )
491
+ elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
492
+ raise ValueError(
493
+ f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
494
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
495
+ )
496
+
497
+ if prompt_embeds is not None and pooled_prompt_embeds is None:
498
+ raise ValueError(
499
+ "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
500
+ )
501
+ if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
502
+ raise ValueError(
503
+ "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
504
+ )
505
+
506
+ if max_sequence_length is not None and max_sequence_length > 512:
507
+ raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
508
+
509
+ @staticmethod
510
+ def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
511
+ latent_image_ids = torch.zeros(height, width, 3)
512
+ latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[:, None]
513
+ latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, :]
514
+
515
+ latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
516
+
517
+ latent_image_ids = latent_image_ids.reshape(
518
+ latent_image_id_height * latent_image_id_width, latent_image_id_channels
519
+ )
520
+
521
+ return latent_image_ids.to(device=device, dtype=dtype)
522
+
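A tiny worked example of the id layout produced by this helper, assuming a 2x2 grid of packed latent positions:

# _prepare_latent_image_ids(batch_size=1, height=2, width=2, device="cpu", dtype=torch.float32)
# yields one (unused, row, col) triple per packed latent token:
#   tensor([[0., 0., 0.],
#           [0., 0., 1.],
#           [0., 1., 0.],
#           [0., 1., 1.]])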
523
+ @staticmethod
524
+ def _pack_latents(latents, batch_size, num_channels_latents, height, width):
525
+ latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
526
+ latents = latents.permute(0, 2, 4, 1, 3, 5)
527
+ latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
528
+
529
+ return latents
530
+
531
+ @staticmethod
532
+ def _unpack_latents(latents, height, width, vae_scale_factor):
533
+ batch_size, num_patches, channels = latents.shape
534
+
535
+ # VAE applies 8x compression on images but we must also account for packing which requires
536
+ # latent height and width to be divisible by 2.
537
+ height = 2 * (int(height) // (vae_scale_factor * 2))
538
+ width = 2 * (int(width) // (vae_scale_factor * 2))
539
+
540
+ latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
541
+ latents = latents.permute(0, 3, 1, 4, 2, 5)
542
+
543
+ latents = latents.reshape(batch_size, channels // (2 * 2), height, width)
544
+
545
+ return latents
546
+
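A short round-trip sketch of the two static helpers above for a 1024x1024 generation (vae_scale_factor = 8, 16 latent channels); it assumes the FluxPipeline class defined in this file is in scope:

import torch

latents = torch.randn(1, 16, 128, 128)  # unpacked VAE-space latents for a 1024x1024 image
packed = FluxPipeline._pack_latents(latents, 1, 16, 128, 128)
print(packed.shape)   # torch.Size([1, 4096, 64]) -> 64x64 grid of 2x2x16 patches
unpacked = FluxPipeline._unpack_latents(packed, 1024, 1024, vae_scale_factor=8)
print(unpacked.shape)                     # torch.Size([1, 16, 128, 128])
print(torch.allclose(latents, unpacked))  # True: packing is lossless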
547
+ def enable_vae_slicing(self):
548
+ r"""
549
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
550
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
551
+ """
552
+ self.vae.enable_slicing()
553
+
554
+ def disable_vae_slicing(self):
555
+ r"""
556
+ Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
557
+ computing decoding in one step.
558
+ """
559
+ self.vae.disable_slicing()
560
+
561
+ def enable_vae_tiling(self):
562
+ r"""
563
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
564
+ compute decoding and encoding in several steps. This is useful to save a large amount of memory and to allow
565
+ the processing of larger images.
566
+ """
567
+ self.vae.enable_tiling()
568
+
569
+ def disable_vae_tiling(self):
570
+ r"""
571
+ Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
572
+ computing decoding in one step.
573
+ """
574
+ self.vae.disable_tiling()
575
+
576
+ def prepare_latents(
577
+ self,
578
+ batch_size,
579
+ num_channels_latents,
580
+ height,
581
+ width,
582
+ dtype,
583
+ device,
584
+ generator,
585
+ latents=None,
586
+ ):
587
+ # VAE applies 8x compression on images but we must also account for packing which requires
588
+ # latent height and width to be divisible by 2.
589
+ height = 2 * (int(height) // (self.vae_scale_factor * 2))
590
+ width = 2 * (int(width) // (self.vae_scale_factor * 2))
591
+
592
+ shape = (batch_size, num_channels_latents, height, width)
593
+
594
+ if latents is not None:
595
+ latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
596
+ return latents.to(device=device, dtype=dtype), latent_image_ids
597
+
598
+ if isinstance(generator, list) and len(generator) != batch_size:
599
+ raise ValueError(
600
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
601
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
602
+ )
603
+
604
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
605
+ latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
606
+
607
+ latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
608
+
609
+ return latents, latent_image_ids
610
+
611
+ @property
612
+ def guidance_scale(self):
613
+ return self._guidance_scale
614
+
615
+ @property
616
+ def joint_attention_kwargs(self):
617
+ return self._joint_attention_kwargs
618
+
619
+ @property
620
+ def num_timesteps(self):
621
+ return self._num_timesteps
622
+
623
+ @property
624
+ def current_timestep(self):
625
+ return self._current_timestep
626
+
627
+ @property
628
+ def interrupt(self):
629
+ return self._interrupt
630
+
631
+ @torch.no_grad()
632
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
633
+ def __call__(
634
+ self,
635
+ prompt: Union[str, List[str]] = None,
636
+ prompt_2: Optional[Union[str, List[str]]] = None,
637
+ negative_prompt: Union[str, List[str]] = None,
638
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
639
+ true_cfg_scale: float = 1.0,
640
+ height: Optional[int] = None,
641
+ width: Optional[int] = None,
642
+ num_inference_steps: int = 28,
643
+ sigmas: Optional[List[float]] = None,
644
+ guidance_scale: float = 3.5,
645
+ num_images_per_prompt: Optional[int] = 1,
646
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
647
+ latents: Optional[torch.FloatTensor] = None,
648
+ prompt_embeds: Optional[torch.FloatTensor] = None,
649
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
650
+ ip_adapter_image: Optional[PipelineImageInput] = None,
651
+ ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
652
+ negative_ip_adapter_image: Optional[PipelineImageInput] = None,
653
+ negative_ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
654
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
655
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
656
+ output_type: Optional[str] = "pil",
657
+ return_dict: bool = True,
658
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
659
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
660
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
661
+ max_sequence_length: int = 512,
662
+ ):
663
+ r"""
664
+ Function invoked when calling the pipeline for generation.
665
+
666
+ Args:
667
+ prompt (`str` or `List[str]`, *optional*):
668
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
669
+ instead.
670
+ prompt_2 (`str` or `List[str]`, *optional*):
671
+ The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt`
672
+ will be used instead.
673
+ negative_prompt (`str` or `List[str]`, *optional*):
674
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
675
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is
676
+ not greater than `1`).
677
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
678
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
679
+ `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
680
+ true_cfg_scale (`float`, *optional*, defaults to 1.0):
681
+ When > 1.0 and a `negative_prompt` is provided, this enables true classifier-free guidance.
682
+ height (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor):
683
+ The height in pixels of the generated image. This is set to 1024 by default for the best results.
684
+ width (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor):
685
+ The width in pixels of the generated image. This is set to 1024 by default for the best results.
686
+ num_inference_steps (`int`, *optional*, defaults to 28):
687
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
688
+ expense of slower inference.
689
+ sigmas (`List[float]`, *optional*):
690
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
691
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
692
+ will be used.
693
+ guidance_scale (`float`, *optional*, defaults to 3.5):
694
+ Guidance scale as defined in [Classifier-Free Diffusion
695
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2
696
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
697
+ `guidance_scale > 1`. A higher guidance scale encourages the model to generate images that are closely linked to
698
+ the text `prompt`, usually at the expense of lower image quality.
699
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
700
+ The number of images to generate per prompt.
701
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
702
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
703
+ to make generation deterministic.
704
+ latents (`torch.FloatTensor`, *optional*):
705
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
706
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
707
+ tensor will be generated by sampling using the supplied random `generator`.
708
+ prompt_embeds (`torch.FloatTensor`, *optional*):
709
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
710
+ provided, text embeddings will be generated from `prompt` input argument.
711
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
712
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
713
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
714
+ ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
715
+ ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
716
+ Pre-generated image embeddings for IP-Adapter. It should be a list whose length matches the number of
717
+ IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not
718
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
719
+ negative_ip_adapter_image:
720
+ (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
721
+ negative_ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
722
+ Pre-generated image embeddings for IP-Adapter. It should be a list whose length matches the number of
723
+ IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not
724
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
725
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
726
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
727
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
728
+ argument.
729
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
730
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
731
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
732
+ input argument.
733
+ output_type (`str`, *optional*, defaults to `"pil"`):
734
+ The output format of the generated image. Choose between
735
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
736
+ return_dict (`bool`, *optional*, defaults to `True`):
737
+ Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
738
+ joint_attention_kwargs (`dict`, *optional*):
739
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
740
+ `self.processor` in
741
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
742
+ callback_on_step_end (`Callable`, *optional*):
743
+ A function that calls at the end of each denoising steps during the inference. The function is called
744
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
745
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
746
+ `callback_on_step_end_tensor_inputs`.
747
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
748
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
749
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
750
+ `._callback_tensor_inputs` attribute of your pipeline class.
751
+ max_sequence_length (`int`, defaults to 512): Maximum sequence length to use with the `prompt`.
752
+
753
+ Examples:
754
+
755
+ Returns:
756
+ [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
757
+ is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
758
+ images.
759
+ """
760
+
761
+ height = height or self.default_sample_size * self.vae_scale_factor
762
+ width = width or self.default_sample_size * self.vae_scale_factor
763
+
764
+ # 1. Check inputs. Raise error if not correct
765
+ self.check_inputs(
766
+ prompt,
767
+ prompt_2,
768
+ height,
769
+ width,
770
+ negative_prompt=negative_prompt,
771
+ negative_prompt_2=negative_prompt_2,
772
+ prompt_embeds=prompt_embeds,
773
+ negative_prompt_embeds=negative_prompt_embeds,
774
+ pooled_prompt_embeds=pooled_prompt_embeds,
775
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
776
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
777
+ max_sequence_length=max_sequence_length,
778
+ )
779
+
780
+ self._guidance_scale = guidance_scale
781
+ self._joint_attention_kwargs = joint_attention_kwargs
782
+ self._current_timestep = None
783
+ self._interrupt = False
784
+
785
+ # 2. Define call parameters
786
+ if prompt is not None and isinstance(prompt, str):
787
+ batch_size = 1
788
+ elif prompt is not None and isinstance(prompt, list):
789
+ batch_size = len(prompt)
790
+ else:
791
+ batch_size = prompt_embeds.shape[0]
792
+
793
+ device = self._execution_device
794
+
795
+ lora_scale = (
796
+ self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
797
+ )
798
+ has_neg_prompt = negative_prompt is not None or (
799
+ negative_prompt_embeds is not None and negative_pooled_prompt_embeds is not None
800
+ )
801
+ do_true_cfg = true_cfg_scale > 1 and has_neg_prompt
802
+ (
803
+ prompt_embeds,
804
+ pooled_prompt_embeds,
805
+ text_ids,
806
+ ) = self.encode_prompt(
807
+ prompt=prompt,
808
+ prompt_2=prompt_2,
809
+ prompt_embeds=prompt_embeds,
810
+ pooled_prompt_embeds=pooled_prompt_embeds,
811
+ device=device,
812
+ num_images_per_prompt=num_images_per_prompt,
813
+ max_sequence_length=max_sequence_length,
814
+ lora_scale=lora_scale,
815
+ )
816
+ if do_true_cfg:
817
+ (
818
+ negative_prompt_embeds,
819
+ negative_pooled_prompt_embeds,
820
+ negative_text_ids,
821
+ ) = self.encode_prompt(
822
+ prompt=negative_prompt,
823
+ prompt_2=negative_prompt_2,
824
+ prompt_embeds=negative_prompt_embeds,
825
+ pooled_prompt_embeds=negative_pooled_prompt_embeds,
826
+ device=device,
827
+ num_images_per_prompt=num_images_per_prompt,
828
+ max_sequence_length=max_sequence_length,
829
+ lora_scale=lora_scale,
830
+ )
831
+
832
+ # 4. Prepare latent variables
833
+ num_channels_latents = self.transformer.config.in_channels // 4
834
+ latents, latent_image_ids = self.prepare_latents(
835
+ batch_size * num_images_per_prompt,
836
+ num_channels_latents,
837
+ height,
838
+ width,
839
+ prompt_embeds.dtype,
840
+ device,
841
+ generator,
842
+ latents,
843
+ )
844
+
845
+ # 5. Prepare timesteps
846
+ sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
847
+ image_seq_len = latents.shape[1]
848
+ mu = calculate_shift(
849
+ image_seq_len,
850
+ self.scheduler.config.get("base_image_seq_len", 256),
851
+ self.scheduler.config.get("max_image_seq_len", 4096),
852
+ self.scheduler.config.get("base_shift", 0.5),
853
+ self.scheduler.config.get("max_shift", 1.15),
854
+ )
855
+ timesteps, num_inference_steps = retrieve_timesteps(
856
+ self.scheduler,
857
+ num_inference_steps,
858
+ device,
859
+ sigmas=sigmas,
860
+ mu=mu,
861
+ )
862
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
863
+ self._num_timesteps = len(timesteps)
864
+
865
+ # handle guidance
866
+ if self.transformer.config.guidance_embeds:
867
+ guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
868
+ guidance = guidance.expand(latents.shape[0])
869
+ else:
870
+ guidance = None
871
+
872
+ if (ip_adapter_image is not None or ip_adapter_image_embeds is not None) and (
873
+ negative_ip_adapter_image is None and negative_ip_adapter_image_embeds is None
874
+ ):
875
+ negative_ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
876
+ negative_ip_adapter_image = [negative_ip_adapter_image] * self.transformer.encoder_hid_proj.num_ip_adapters
877
+
878
+ elif (ip_adapter_image is None and ip_adapter_image_embeds is None) and (
879
+ negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None
880
+ ):
881
+ ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
882
+ ip_adapter_image = [ip_adapter_image] * self.transformer.encoder_hid_proj.num_ip_adapters
883
+
884
+ if self.joint_attention_kwargs is None:
885
+ self._joint_attention_kwargs = {}
886
+
887
+ image_embeds = None
888
+ negative_image_embeds = None
889
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
890
+ image_embeds = self.prepare_ip_adapter_image_embeds(
891
+ ip_adapter_image,
892
+ ip_adapter_image_embeds,
893
+ device,
894
+ batch_size * num_images_per_prompt,
895
+ )
896
+ if negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None:
897
+ negative_image_embeds = self.prepare_ip_adapter_image_embeds(
898
+ negative_ip_adapter_image,
899
+ negative_ip_adapter_image_embeds,
900
+ device,
901
+ batch_size * num_images_per_prompt,
902
+ )
903
+
904
+ # 6. Denoising loop
905
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
906
+ for i, t in enumerate(timesteps):
907
+ if self.interrupt:
908
+ continue
909
+
910
+ self._current_timestep = t
911
+ if image_embeds is not None:
912
+ self._joint_attention_kwargs["ip_adapter_image_embeds"] = image_embeds
913
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
914
+ timestep = t.expand(latents.shape[0]).to(latents.dtype)
915
+
916
+ noise_pred = self.transformer(
917
+ hidden_states=latents,
918
+ timestep=timestep / 1000,
919
+ guidance=guidance,
920
+ pooled_projections=pooled_prompt_embeds,
921
+ encoder_hidden_states=prompt_embeds,
922
+ txt_ids=text_ids,
923
+ img_ids=latent_image_ids,
924
+ joint_attention_kwargs=self.joint_attention_kwargs,
925
+ return_dict=False,
926
+ )[0]
927
+
928
+ if do_true_cfg:
929
+ if negative_image_embeds is not None:
930
+ self._joint_attention_kwargs["ip_adapter_image_embeds"] = negative_image_embeds
931
+ neg_noise_pred = self.transformer(
932
+ hidden_states=latents,
933
+ timestep=timestep / 1000,
934
+ guidance=guidance,
935
+ pooled_projections=negative_pooled_prompt_embeds,
936
+ encoder_hidden_states=negative_prompt_embeds,
937
+ txt_ids=negative_text_ids,
938
+ img_ids=latent_image_ids,
939
+ joint_attention_kwargs=self.joint_attention_kwargs,
940
+ return_dict=False,
941
+ )[0]
942
+ noise_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred)
943
+
944
+ # compute the previous noisy sample x_t -> x_t-1
945
+ latents_dtype = latents.dtype
946
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
947
+
948
+ if latents.dtype != latents_dtype:
949
+ if torch.backends.mps.is_available():
950
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
951
+ latents = latents.to(latents_dtype)
952
+
953
+ if callback_on_step_end is not None:
954
+ callback_kwargs = {}
955
+ for k in callback_on_step_end_tensor_inputs:
956
+ callback_kwargs[k] = locals()[k]
957
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
958
+
959
+ latents = callback_outputs.pop("latents", latents)
960
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
961
+
962
+ # call the callback, if provided
963
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
964
+ progress_bar.update()
965
+
966
+ if XLA_AVAILABLE:
967
+ xm.mark_step()
968
+
969
+ self._current_timestep = None
970
+
971
+ if output_type == "latent":
972
+ image = latents
973
+ else:
974
+ latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
975
+ latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
976
+ image = self.vae.decode(latents, return_dict=False)[0]
977
+ image = self.image_processor.postprocess(image, output_type=output_type)
978
+
979
+ # Offload all models
980
+ self.maybe_free_model_hooks()
981
+
982
+ if not return_dict:
983
+ return (image,)
984
+
985
+ return FluxPipelineOutput(images=image)
986
+
987
+
988
+ @torch.no_grad()
989
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
990
+ def sde_sampling(
991
+ self,
992
+ prompt: Union[str, List[str]] = None,
993
+ prompt_2: Optional[Union[str, List[str]]] = None,
994
+ negative_prompt: Union[str, List[str]] = None,
995
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
996
+ true_cfg_scale: float = 1.0,
997
+ height: Optional[int] = None,
998
+ width: Optional[int] = None,
999
+ num_inference_steps: int = 28,
1000
+ sigmas: Optional[List[float]] = None,
1001
+ guidance_scale: float = 3.5,
1002
+ num_images_per_prompt: Optional[int] = 1,
1003
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
1004
+ latents: Optional[torch.FloatTensor] = None,
1005
+ prompt_embeds: Optional[torch.FloatTensor] = None,
1006
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
1007
+ ip_adapter_image: Optional[PipelineImageInput] = None,
1008
+ ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
1009
+ negative_ip_adapter_image: Optional[PipelineImageInput] = None,
1010
+ negative_ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
1011
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
1012
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
1013
+ output_type: Optional[str] = "pil",
1014
+ return_dict: bool = True,
1015
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
1016
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
1017
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
1018
+ max_sequence_length: int = 512,
1019
+ ):
1020
+ r"""
1021
+ Function invoked when calling the pipeline for generation.
1022
+
1023
+ Args:
1024
+ prompt (`str` or `List[str]`, *optional*):
1025
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
1026
+ instead.
1027
+ prompt_2 (`str` or `List[str]`, *optional*):
1028
+ The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt`
1029
+ will be used instead.
1030
+ negative_prompt (`str` or `List[str]`, *optional*):
1031
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
1032
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is
1033
+ not greater than `1`).
1034
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
1035
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
1036
+ `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
1037
+ true_cfg_scale (`float`, *optional*, defaults to 1.0):
1038
+ When > 1.0 and a `negative_prompt` is provided, enables true classifier-free guidance.
1039
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
1040
+ The height in pixels of the generated image. This is set to 1024 by default for the best results.
1041
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
1042
+ The width in pixels of the generated image. This is set to 1024 by default for the best results.
1043
+ num_inference_steps (`int`, *optional*, defaults to 28):
1044
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
1045
+ expense of slower inference.
1046
+ sigmas (`List[float]`, *optional*):
1047
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
1048
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
1049
+ will be used.
1050
+ guidance_scale (`float`, *optional*, defaults to 3.5):
1051
+ Guidance scale as defined in [Classifier-Free Diffusion
1052
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
1053
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
1054
+ `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
1055
+ the text `prompt`, usually at the expense of lower image quality.
1056
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
1057
+ The number of images to generate per prompt.
1058
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
1059
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
1060
+ to make generation deterministic.
1061
+ latents (`torch.FloatTensor`, *optional*):
1062
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
1063
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
1064
+ tensor will be generated by sampling using the supplied random `generator`.
1065
+ prompt_embeds (`torch.FloatTensor`, *optional*):
1066
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
1067
+ provided, text embeddings will be generated from `prompt` input argument.
1068
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
1069
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
1070
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
1071
+ ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
1072
+ ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
1073
+ Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
1074
+ IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not
1075
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
1076
+ negative_ip_adapter_image:
1077
+ (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
1078
+ negative_ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
1079
+ Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
1080
+ IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not
1081
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
1082
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
1083
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
1084
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
1085
+ argument.
1086
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
1087
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
1088
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
1089
+ input argument.
1090
+ output_type (`str`, *optional*, defaults to `"pil"`):
1091
+ The output format of the generated image. Choose between
1092
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
1093
+ return_dict (`bool`, *optional*, defaults to `True`):
1094
+ Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
1095
+ joint_attention_kwargs (`dict`, *optional*):
1096
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
1097
+ `self.processor` in
1098
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
1099
+ callback_on_step_end (`Callable`, *optional*):
1100
+ A function that is called at the end of each denoising step during inference. The function is called
1101
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
1102
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
1103
+ `callback_on_step_end_tensor_inputs`.
1104
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
1105
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
1106
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
1107
+ `._callback_tensor_inputs` attribute of your pipeline class.
1108
+ max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.
1109
+
1110
+ Examples:
1111
+
1112
+ Returns:
1113
+ [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
1114
+ is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
1115
+ images.
1116
+ """
1117
+
1118
+ height = height or self.default_sample_size * self.vae_scale_factor
1119
+ width = width or self.default_sample_size * self.vae_scale_factor
1120
+
1121
+ # 1. Check inputs. Raise error if not correct
1122
+ self.check_inputs(
1123
+ prompt,
1124
+ prompt_2,
1125
+ height,
1126
+ width,
1127
+ negative_prompt=negative_prompt,
1128
+ negative_prompt_2=negative_prompt_2,
1129
+ prompt_embeds=prompt_embeds,
1130
+ negative_prompt_embeds=negative_prompt_embeds,
1131
+ pooled_prompt_embeds=pooled_prompt_embeds,
1132
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
1133
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
1134
+ max_sequence_length=max_sequence_length,
1135
+ )
1136
+
1137
+ self._guidance_scale = guidance_scale
1138
+ self._joint_attention_kwargs = joint_attention_kwargs
1139
+ self._current_timestep = None
1140
+ self._interrupt = False
1141
+
1142
+ # 2. Define call parameters
1143
+ if prompt is not None and isinstance(prompt, str):
1144
+ batch_size = 1
1145
+ elif prompt is not None and isinstance(prompt, list):
1146
+ batch_size = len(prompt)
1147
+ else:
1148
+ batch_size = prompt_embeds.shape[0]
1149
+
1150
+ device = self._execution_device
1151
+
1152
+ lora_scale = (
1153
+ self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
1154
+ )
1155
+ has_neg_prompt = negative_prompt is not None or (
1156
+ negative_prompt_embeds is not None and negative_pooled_prompt_embeds is not None
1157
+ )
1158
+ do_true_cfg = true_cfg_scale > 1 and has_neg_prompt
1159
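+ # 3. Encode prompt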
+ (
1160
+ prompt_embeds,
1161
+ pooled_prompt_embeds,
1162
+ text_ids,
1163
+ ) = self.encode_prompt(
1164
+ prompt=prompt,
1165
+ prompt_2=prompt_2,
1166
+ prompt_embeds=prompt_embeds,
1167
+ pooled_prompt_embeds=pooled_prompt_embeds,
1168
+ device=device,
1169
+ num_images_per_prompt=num_images_per_prompt,
1170
+ max_sequence_length=max_sequence_length,
1171
+ lora_scale=lora_scale,
1172
+ )
1173
+ if do_true_cfg:
1174
+ (
1175
+ negative_prompt_embeds,
1176
+ negative_pooled_prompt_embeds,
1177
+ negative_text_ids,
1178
+ ) = self.encode_prompt(
1179
+ prompt=negative_prompt,
1180
+ prompt_2=negative_prompt_2,
1181
+ prompt_embeds=negative_prompt_embeds,
1182
+ pooled_prompt_embeds=negative_pooled_prompt_embeds,
1183
+ device=device,
1184
+ num_images_per_prompt=num_images_per_prompt,
1185
+ max_sequence_length=max_sequence_length,
1186
+ lora_scale=lora_scale,
1187
+ )
1188
+
1189
+ # 4. Prepare latent variables
1190
+ num_channels_latents = self.transformer.config.in_channels // 4
1191
+ latents, latent_image_ids = self.prepare_latents(
1192
+ batch_size * num_images_per_prompt,
1193
+ num_channels_latents,
1194
+ height,
1195
+ width,
1196
+ prompt_embeds.dtype,
1197
+ device,
1198
+ generator,
1199
+ latents,
1200
+ )
1201
+
1202
+ # 5. Prepare timesteps
1203
+ sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
1204
+ image_seq_len = latents.shape[1]
1205
+ mu = calculate_shift(
1206
+ image_seq_len,
1207
+ self.scheduler.config.get("base_image_seq_len", 256),
1208
+ self.scheduler.config.get("max_image_seq_len", 4096),
1209
+ self.scheduler.config.get("base_shift", 0.5),
1210
+ self.scheduler.config.get("max_shift", 1.15),
1211
+ )
1212
+ timesteps, num_inference_steps = retrieve_timesteps(
1213
+ self.scheduler,
1214
+ num_inference_steps,
1215
+ device,
1216
+ sigmas=sigmas,
1217
+ mu=mu,
1218
+ )
1219
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
1220
+ self._num_timesteps = len(timesteps)
1221
+
1222
+ # handle guidance
1223
+ if self.transformer.config.guidance_embeds:
1224
+ guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
1225
+ guidance = guidance.expand(latents.shape[0])
1226
+ else:
1227
+ guidance = None
1228
+
1229
+ if (ip_adapter_image is not None or ip_adapter_image_embeds is not None) and (
1230
+ negative_ip_adapter_image is None and negative_ip_adapter_image_embeds is None
1231
+ ):
1232
+ negative_ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
1233
+ negative_ip_adapter_image = [negative_ip_adapter_image] * self.transformer.encoder_hid_proj.num_ip_adapters
1234
+
1235
+ elif (ip_adapter_image is None and ip_adapter_image_embeds is None) and (
1236
+ negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None
1237
+ ):
1238
+ ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
1239
+ ip_adapter_image = [ip_adapter_image] * self.transformer.encoder_hid_proj.num_ip_adapters
1240
+
1241
+ if self.joint_attention_kwargs is None:
1242
+ self._joint_attention_kwargs = {}
1243
+
1244
+ image_embeds = None
1245
+ negative_image_embeds = None
1246
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
1247
+ image_embeds = self.prepare_ip_adapter_image_embeds(
1248
+ ip_adapter_image,
1249
+ ip_adapter_image_embeds,
1250
+ device,
1251
+ batch_size * num_images_per_prompt,
1252
+ )
1253
+ if negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None:
1254
+ negative_image_embeds = self.prepare_ip_adapter_image_embeds(
1255
+ negative_ip_adapter_image,
1256
+ negative_ip_adapter_image_embeds,
1257
+ device,
1258
+ batch_size * num_images_per_prompt,
1259
+ )
1260
+
1261
+ # 6. Denoising loop
1262
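+ # Buffers for the per-step trajectory (transformer inputs, previous latents, sampled latents, log-probs), returned alongside the decoded image.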
+ prev_latents = []
1263
+ pred_latents = []
1264
+ # preds_lst = []
1265
+ states = {
1266
+ "timestep": [],
1267
+ "guidance": [],
1268
+ "pooled_projections": [],
1269
+ "encoder_hidden_states": [],
1270
+ "txt_ids": None,
1271
+ "img_ids": None,
1272
+ }
1273
+ log_probs = []
1274
+ ts = []
1275
+ states["txt_ids"] = text_ids if text_ids is not None else None
1276
+ states["img_ids"] = latent_image_ids if latent_image_ids is not None else None
1277
+
1278
+ # self.scheduler.set_begin_index(0)
1279
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
1280
+ for i, t in enumerate(timesteps):
1281
+ if self.interrupt:
1282
+ continue
1283
+
1284
+ self._current_timestep = t
1285
+ if image_embeds is not None:
1286
+ self._joint_attention_kwargs["ip_adapter_image_embeds"] = image_embeds
1287
+
1288
+
1289
+ timestep = (t.expand(latents.shape[0]) / 1000.0).to(latents.dtype)  # already scaled to [0, 1]; the transformer call below uses it as-is
1290
+
1291
+ states["timestep"].append(timestep.unsqueeze(1)) # Unsqueezed if needed for batch/timestep handling
1292
+ states["guidance"].append(guidance.unsqueeze(1) if torch.is_tensor(guidance) else guidance) # Handle if tensor
1293
+ states["pooled_projections"].append(pooled_prompt_embeds.unsqueeze(1) if pooled_prompt_embeds is not None else None) # Unsqueezed along seq/batch if applicable
1294
+ states["encoder_hidden_states"].append(prompt_embeds.unsqueeze(1) if prompt_embeds is not None else None) # Unsqueezed along seq dim if needed
1295
+
1296
+ ts.append(t.expand(latents.shape[0]).unsqueeze(1))
1297
+ prev_latents.append(latents.detach().clone().unsqueeze(1))
1298
+
1299
+ noise_pred = self.transformer(
1300
+ hidden_states=latents,
1301
+ timestep=timestep,
1302
+ guidance=guidance,
1303
+ pooled_projections=pooled_prompt_embeds,
1304
+ encoder_hidden_states=prompt_embeds,
1305
+ txt_ids=text_ids,
1306
+ img_ids=latent_image_ids,
1307
+ joint_attention_kwargs=self.joint_attention_kwargs,
1308
+ return_dict=False,
1309
+ )[0]
1310
+
1311
+ if do_true_cfg:
1312
+ if negative_image_embeds is not None:
1313
+ self._joint_attention_kwargs["ip_adapter_image_embeds"] = negative_image_embeds
1314
+
1315
+ neg_noise_pred = self.transformer(
1316
+ hidden_states=latents,
1317
+ timestep=timestep,
1318
+ guidance=guidance,
1319
+ pooled_projections=negative_pooled_prompt_embeds,
1320
+ encoder_hidden_states=negative_prompt_embeds,
1321
+ txt_ids=negative_text_ids,
1322
+ img_ids=latent_image_ids,
1323
+ joint_attention_kwargs=self.joint_attention_kwargs,
1324
+ return_dict=False,
1325
+ )[0]
1326
+ noise_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred)
1327
+
1328
+ latents_dtype = latents.dtype
1329
+ latents, log_prob, prev_latents_mean, std_dev_t = sde_step_with_logprob(self.scheduler, noise_pred.float(), t.expand(latents.shape[0]), latents.float())
1330
+
1331
+ log_probs.append(log_prob.detach().clone().unsqueeze(1))
1332
+ pred_latents.append(latents.detach().clone().unsqueeze(1))
1333
+ if latents.dtype != latents_dtype:
1334
+ latents = latents.to(latents_dtype)
1335
+
1336
+ if callback_on_step_end is not None:
1337
+ callback_kwargs = {}
1338
+ for k in callback_on_step_end_tensor_inputs:
1339
+ callback_kwargs[k] = locals()[k]
1340
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
1341
+
1342
+ latents = callback_outputs.pop("latents", latents)
1343
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1344
+
1345
+ # call the callback, if provided
1346
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
1347
+ progress_bar.update()
1348
+
1349
+ if XLA_AVAILABLE:
1350
+ xm.mark_step()
1351
+
1352
+ self._current_timestep = None
1353
+
1354
+ if output_type == "latent":
1355
+ image = latents
1356
+ else:
1357
+ latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
1358
+ latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
1359
+ image = self.vae.decode(latents, return_dict=False)[0]
1360
+ image = self.image_processor.postprocess(image, output_type=output_type)
1361
+
1362
+
1363
+ batched_states = {}
1364
+ batch_size = latents.shape[0]
1365
+ num_steps = len(timesteps)
1366
+
1367
+ for key, value_list in states.items():
1368
+ if value_list is None or len(value_list) == 0: # Skip None or empty lists
1369
+ batched_states[key] = None
1370
+ continue
1371
+ if value_list[0] is None: # Handle lists of None (e.g., optional inputs)
1372
+ batched_states[key] = None
1373
+ continue
1374
+ # Concatenate along dim=1
1375
+ if isinstance(value_list, list):
1376
+ concatenated = torch.cat(value_list, dim=1) # Shape: (batch, steps, ...)
1377
+ if len(concatenated.shape) <= 2: # 1D tensors (e.g., timestep: batch, steps)
1378
+ # print(key, concatenated.shape)
1379
+ batched_states[key] = concatenated.view(-1)
1380
+ else: # Higher-dim tensors (e.g., latents: batch, steps, channels, h, w)
1381
+ batched_states[key] = concatenated.view(-1, *concatenated.shape[2:])
1382
+ else:
1383
+ batched_states[key] = value_list
1384
+ # assert 0
1385
+ prev_latents = torch.cat(prev_latents, dim=1)
1386
+ log_probs = torch.cat(log_probs, dim=1)
1387
+ pred_latents = torch.cat(pred_latents, dim=1)
1388
+ ts = torch.cat(ts, dim=1)
1389
+
1390
+ prev_latents = prev_latents.view(prev_latents.shape[0] * prev_latents.shape[1], *prev_latents.shape[2:])
1391
+ log_probs = log_probs.view(log_probs.shape[0] * log_probs.shape[1], *log_probs.shape[2:])
1392
+ pred_latents = pred_latents.view(pred_latents.shape[0] * pred_latents.shape[1], *pred_latents.shape[2:])
1393
+ ts = ts.view(-1)
1394
+
1395
+ # Offload all models
1396
+ self.maybe_free_model_hooks()
1397
+
1398
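+ # Trajectory tensors have been flattened to (batch * num_steps, ...), so each denoising transition is a separate row.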
+ return (image, prev_latents, log_probs, pred_latents, ts, batched_states)
1399
+
1400
+ def sde_step_with_logprob(
1401
+ self: FlowMatchEulerDiscreteScheduler,
1402
+ model_output: torch.FloatTensor,
1403
+ timestep: Union[float, torch.FloatTensor],
1404
+ sample: torch.FloatTensor,
1405
+ prev_sample: Optional[torch.FloatTensor] = None,
1406
+ generator: Optional[torch.Generator] = None,
1407
+ ) -> Union[FlowMatchEulerDiscreteSchedulerOutput, Tuple]:
1408
+ """
1409
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the flow
1410
+ process from the learned model outputs (most often the predicted velocity).
1411
+
1412
+ Args:
1413
+ model_output (`torch.FloatTensor`):
1414
+ The direct output from learned flow model.
1415
+ timestep (`float`):
1416
+ The current discrete timestep in the diffusion chain.
1417
+ sample (`torch.FloatTensor`):
1418
+ A current instance of a sample created by the diffusion process.
1419
+ generator (`torch.Generator`, *optional*):
1420
+ A random number generator.
1421
+ """
1422
+ step_index = [self.index_for_timestep(t) for t in timestep]
1423
+ prev_step_index = [step+1 for step in step_index]
1424
+ sigma = self.sigmas[step_index].view(-1, 1, 1).to(model_output.device)
1425
+ sigma_prev = self.sigmas[prev_step_index].view(-1, 1, 1).to(model_output.device)
1426
+ sigma_max = self.sigmas[1].item()
1427
+ dt = sigma_prev - sigma
1428
+
1429
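+ # Where sigma == 1 (the very first step), substitute self.sigmas[1] so the 1 - sigma denominator does not vanish.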
+ std_dev_t = torch.sqrt(sigma / (1 - torch.where(sigma == 1, sigma_max, sigma))) * 1.0
1430
+
1431
+
1432
+ # our sde
1433
+ prev_sample_mean = sample*(1+std_dev_t**2/(2*sigma)*dt)+model_output*(1+std_dev_t**2*(1-sigma)/(2*sigma))*dt
1434
+
1435
+ if prev_sample is not None and generator is not None:
1436
+ raise ValueError(
1437
+ "Cannot pass both generator and prev_sample. Please make sure that either `generator` or"
1438
+ " `prev_sample` stays `None`."
1439
+ )
1440
+
1441
+ if prev_sample is None:
1442
+ variance_noise = randn_tensor(
1443
+ model_output.shape,
1444
+ generator=generator,
1445
+ device=model_output.device,
1446
+ dtype=model_output.dtype,
1447
+ )
1448
+ prev_sample = prev_sample_mean + std_dev_t * torch.sqrt(-1*dt) * variance_noise
1449
+
1450
+
1451
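+ # Element-wise Gaussian log-density of prev_sample under N(prev_sample_mean, (std_dev_t * sqrt(-dt))**2).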
+ log_prob = (
1452
+ -((prev_sample.detach() - prev_sample_mean) ** 2) / (2 * ((std_dev_t * torch.sqrt(-1*dt))**2))
1453
+ - torch.log(std_dev_t * torch.sqrt(-1*dt))
1454
+ - torch.log(torch.sqrt(2 * torch.as_tensor(math.pi)))
1455
+ )
1456
+
1457
+ # mean along all but batch dimension
1458
+ log_prob = log_prob.mean(dim=tuple(range(1, log_prob.ndim)))
1459
+
1460
+ return prev_sample, log_prob, prev_sample_mean, std_dev_t * torch.sqrt(-1*dt)
1461
+
1462
+
1463
+
1464
+ # Copyright 2025 Fu-Yun Wang
1465
+ #
1466
+ # Licensed under the Apache License, Version 2.0 (the "License");
1467
+ # you may not use this file except in compliance with the License.
1468
+ # You may obtain a copy of the License at
1469
+ #
1470
+ # http://www.apache.org/licenses/LICENSE-2.0
1471
+ #
1472
+ # Unless required by applicable law or agreed to in writing, software
1473
+ # distributed under the License is distributed on an "AS IS" BASIS,
1474
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1475
+ # See the License for the specific language governing permissions and
1476
+ # limitations under the License.
1477
+
1478
+ def sde_step_with_logprob_simple(
1479
+ self: FlowMatchEulerDiscreteScheduler,
1480
+ model_output: torch.FloatTensor,
1481
+ timestep: Union[float, torch.FloatTensor],
1482
+ sample: torch.FloatTensor,
1483
+ prev_sample: Optional[torch.FloatTensor] = None,
1484
+ generator: Optional[torch.Generator] = None,
1485
+ ):
1486
+ """
1487
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the flow
1488
+ process from the learned model outputs (most often the predicted velocity).
1489
+
1490
+ Args:
1491
+ model_output (`torch.FloatTensor`):
1492
+ The direct output from learned flow model.
1493
+ timestep (`float`):
1494
+ The current discrete timestep in the diffusion chain.
1495
+ sample (`torch.FloatTensor`):
1496
+ A current instance of a sample created by the diffusion process.
1497
+ generator (`torch.Generator`, *optional*):
1498
+ A random number generator.
1499
+ """
1500
+
1501
+ step_index = [self.index_for_timestep(t) for t in timestep]
1502
+ prev_step_index = [step+1 for step in step_index]
1503
+ sigma = self.sigmas[step_index].view(-1, 1, 1).to(model_output.device)
1504
+ sigma_prev = self.sigmas[prev_step_index].view(-1, 1, 1).to(model_output.device)
1505
+ sigma_max = self.sigmas[1].item()
1506
+ dt = sigma_prev - sigma
1507
+
1508
+
1509
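+ # eta scales the injected noise; with eta = 0 this reduces to the deterministic Euler flow-matching update.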
+ eta = 0.5
1510
+ Dt = - dt * eta
1511
+
1512
+ prev_sample_mean = sample * (1 - Dt / (1 - torch.where(sigma == 1, sigma_max, sigma))) + model_output * (dt - Dt)
1513
+
1514
+ std_dev_t = torch.sqrt(2 * Dt * (sigma / (1 - torch.where(sigma == 1, sigma_max, sigma))))
1515
+
1516
+ if prev_sample is not None and generator is not None:
1517
+ raise ValueError(
1518
+ "Cannot pass both generator and prev_sample. Please make sure that either `generator` or"
1519
+ " `prev_sample` stays `None`."
1520
+ )
1521
+
1522
+ if prev_sample is None:
1523
+ # Generate noise if not provided
1524
+ variance_noise = randn_tensor(
1525
+ model_output.shape,
1526
+ generator=generator,
1527
+ device=model_output.device,
1528
+ dtype=model_output.dtype,
1529
+ )
1530
+
1531
+ prev_sample = prev_sample_mean + std_dev_t * variance_noise
1532
+
1533
+
1534
+ log_prob = (
1535
+ -((prev_sample.detach() - prev_sample_mean) ** 2) / (2 * (std_dev_t**2))
1536
+ - torch.log(std_dev_t)
1537
+ - torch.log(torch.sqrt(2 * torch.as_tensor(math.pi)))
1538
+ )
1539
+
1540
+ # mean along all but batch dimension
1541
+ log_prob = log_prob.mean(dim=tuple(range(1, log_prob.ndim)))
1542
+
1543
+ return prev_sample, log_prob, prev_sample_mean, std_dev_t
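
A minimal usage sketch (not part of the committed file): assuming `pipe` is an instance of the pipeline class defined above, `sde_sampling` can be called in place of the standard `__call__` to additionally obtain the per-step trajectory and log-probabilities.

```py
# Hedged sketch: run the stochastic (SDE) sampler instead of the deterministic __call__.
image, prev_latents, log_probs, pred_latents, ts, states = pipe.sde_sampling(
    prompt="a photo of a cat",
    num_inference_steps=10,
    guidance_scale=3.5,
)
# log_probs has shape (batch * num_steps,): one log-probability per denoising transition.
```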
unimodel/qwenflux/qwenflux_inference.py ADDED
@@ -0,0 +1,418 @@
1
+ # Copyright 2025 Fu-Yun Wang
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from typing import List, Optional, Tuple, Union, Dict
17
+ import torch
18
+ import torch.nn as nn
19
+ from PIL import Image
20
+ import torch.nn.functional as F
21
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoProcessor
22
+ from transformers import Qwen2_5_VLConfig, Qwen2_5_VLModel, Qwen2_5_VLForConditionalGeneration
23
+
24
+
25
+ from diffusers.utils.torch_utils import randn_tensor
26
+ from diffusers.pipelines.pipeline_utils import numpy_to_pil
27
+ import numpy as np
28
+ from diffusers.schedulers.scheduling_flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler, FlowMatchEulerDiscreteSchedulerOutput
29
+ from diffusers.schedulers import DPMSolverMultistepScheduler
30
+ import math
31
+ from diffusers.utils.torch_utils import randn_tensor
32
+ from diffusers import FluxTransformer2DModel, AutoencoderKL, FlowMatchEulerDiscreteScheduler
33
+ from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast, CLIPTextConfig, T5Config
34
+ from .fluxpipeline import FluxPipeline
35
+ import re
36
+ import datetime
37
+ import os
38
+
39
+
40
+ def save_grid_image(prompt, images, rows, cols):
41
+ timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")  # `import datetime` imports the module, so the class is datetime.datetime
42
+ base_dir = os.path.join("samples", timestamp, prompt[:100])
43
+ os.makedirs(base_dir, exist_ok=True)
44
+
45
+ filename = os.path.join(base_dir, "grid.jpg")
46
+ grid_image = create_image_grid(images, rows, cols)
47
+ grid_image.save(filename)
48
+
49
+ print(f"Saved: {filename}")
50
+
51
+ def create_image_grid(images, rows, cols):
52
+ """Creates a grid of images and returns a single PIL Image."""
53
+
54
+ assert len(images) == rows * cols
55
+
56
+ width, height = images[0].size
57
+ grid_width = width * cols
58
+ grid_height = height * rows
59
+
60
+ grid_image = Image.new('RGB', (grid_width, grid_height))
61
+
62
+ for i, image in enumerate(images):
63
+ x = (i % cols) * width
64
+ y = (i // cols) * height
65
+ grid_image.paste(image, (x, y))
66
+
67
+ return grid_image
68
+
69
+
70
+ def sde_step_with_logprob(
71
+ self: FlowMatchEulerDiscreteScheduler,
72
+ model_output: torch.FloatTensor,
73
+ timestep: Union[float, torch.FloatTensor],
74
+ sample: torch.FloatTensor,
75
+ prev_sample: Optional[torch.FloatTensor] = None,
76
+ generator: Optional[torch.Generator] = None,
77
+ ) -> Union[FlowMatchEulerDiscreteSchedulerOutput, Tuple]:
78
+ """
79
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the flow
80
+ process from the learned model outputs (most often the predicted velocity).
81
+
82
+ Args:
83
+ model_output (`torch.FloatTensor`):
84
+ The direct output from learned flow model.
85
+ timestep (`float`):
86
+ The current discrete timestep in the diffusion chain.
87
+ sample (`torch.FloatTensor`):
88
+ A current instance of a sample created by the diffusion process.
89
+ generator (`torch.Generator`, *optional*):
90
+ A random number generator.
91
+ """
92
+ step_index = [self.index_for_timestep(t) for t in timestep]
93
+ prev_step_index = [step+1 for step in step_index]
94
+ sigma = self.sigmas[step_index].view(-1, 1, 1).to(model_output.device)
95
+ sigma_prev = self.sigmas[prev_step_index].view(-1, 1, 1).to(model_output.device)
96
+ sigma_max = self.sigmas[1].item()
97
+ dt = sigma_prev - sigma
98
+
99
+ std_dev_t = torch.sqrt(sigma / (1 - torch.where(sigma == 1, sigma_max, sigma)))*1.0
100
+
101
+
102
+ # our sde
103
+ prev_sample_mean = sample*(1+std_dev_t**2/(2*sigma)*dt)+model_output*(1+std_dev_t**2*(1-sigma)/(2*sigma))*dt
104
+
105
+ if prev_sample is not None and generator is not None:
106
+ raise ValueError(
107
+ "Cannot pass both generator and prev_sample. Please make sure that either `generator` or"
108
+ " `prev_sample` stays `None`."
109
+ )
110
+
111
+ if prev_sample is None:
112
+ variance_noise = randn_tensor(
113
+ model_output.shape,
114
+ generator=generator,
115
+ device=model_output.device,
116
+ dtype=model_output.dtype,
117
+ )
118
+ prev_sample = prev_sample_mean + std_dev_t * torch.sqrt(-1*dt) * variance_noise
119
+
120
+
121
+ log_prob = (
122
+ -((prev_sample.detach() - prev_sample_mean) ** 2) / (2 * ((std_dev_t * torch.sqrt(-1*dt))**2))
123
+ - torch.log(std_dev_t * torch.sqrt(-1*dt))
124
+ - torch.log(torch.sqrt(2 * torch.as_tensor(math.pi)))
125
+ )
126
+
127
+ # mean along all but batch dimension
128
+ log_prob = log_prob.mean(dim=tuple(range(1, log_prob.ndim)))
129
+
130
+ return prev_sample, log_prob, prev_sample_mean, std_dev_t * torch.sqrt(-1*dt)
131
+
132
+
133
+ # Copyright 2025 Fu-Yun Wang
134
+ #
135
+ # Licensed under the Apache License, Version 2.0 (the "License");
136
+ # you may not use this file except in compliance with the License.
137
+ # You may obtain a copy of the License at
138
+ #
139
+ # http://www.apache.org/licenses/LICENSE-2.0
140
+ #
141
+ # Unless required by applicable law or agreed to in writing, software
142
+ # distributed under the License is distributed on an "AS IS" BASIS,
143
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
144
+ # See the License for the specific language governing permissions and
145
+ # limitations under the License.
146
+
147
+ def sde_step_with_logprob_simple(
148
+ self: FlowMatchEulerDiscreteScheduler,
149
+ model_output: torch.FloatTensor,
150
+ timestep: Union[float, torch.FloatTensor],
151
+ sample: torch.FloatTensor,
152
+ prev_sample: Optional[torch.FloatTensor] = None,
153
+ generator: Optional[torch.Generator] = None,
154
+ ):
155
+ """
156
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the flow
157
+ process from the learned model outputs (most often the predicted velocity).
158
+
159
+ Args:
160
+ model_output (`torch.FloatTensor`):
161
+ The direct output from learned flow model.
162
+ timestep (`float`):
163
+ The current discrete timestep in the diffusion chain.
164
+ sample (`torch.FloatTensor`):
165
+ A current instance of a sample created by the diffusion process.
166
+ generator (`torch.Generator`, *optional*):
167
+ A random number generator.
168
+ """
169
+
170
+ step_index = [self.index_for_timestep(t) for t in timestep]
171
+ prev_step_index = [step+1 for step in step_index]
172
+ sigma = self.sigmas[step_index].view(-1, 1, 1, 1).to(model_output.device)
173
+ sigma_prev = self.sigmas[prev_step_index].view(-1, 1, 1, 1).to(model_output.device)
174
+ sigma_max = self.sigmas[1].item()
175
+ dt = sigma_prev - sigma
176
+
177
+
178
+ eta = 0.5
179
+ Dt = - dt * eta
180
+
181
+ prev_sample_mean = sample * (1 - Dt / (1 - torch.where(sigma == 1, sigma_max, sigma))) + model_output * (dt - Dt)
182
+
183
+ std_dev_t = torch.sqrt(2 * Dt * (sigma / (1 - torch.where(sigma == 1, sigma_max, sigma))))
184
+
185
+ if prev_sample is not None and generator is not None:
186
+ raise ValueError(
187
+ "Cannot pass both generator and prev_sample. Please make sure that either `generator` or"
188
+ " `prev_sample` stays `None`."
189
+ )
190
+
191
+ if prev_sample is None:
192
+ # Generate noise if not provided
193
+ variance_noise = randn_tensor(
194
+ model_output.shape,
195
+ generator=generator,
196
+ device=model_output.device,
197
+ dtype=model_output.dtype,
198
+ )
199
+
200
+ prev_sample = prev_sample_mean + std_dev_t * variance_noise
201
+
202
+
203
+ log_prob = (
204
+ -((prev_sample.detach() - prev_sample_mean) ** 2) / (2 * (std_dev_t**2))
205
+ - torch.log(std_dev_t)
206
+ - torch.log(torch.sqrt(2 * torch.as_tensor(math.pi)))
207
+ )
208
+
209
+ # mean along all but batch dimension
210
+ log_prob = log_prob.mean(dim=tuple(range(1, log_prob.ndim)))
211
+
212
+ return prev_sample, log_prob, prev_sample_mean, std_dev_t
213
+
214
+ class QwenFluxMetaModel:
215
+
216
+ def __init__(self, config):
217
+ super(QwenFluxMetaModel, self).__init__(config)
218
+
219
+ if hasattr(config, "diffusion_expert"):
220
+ ckpt_id = "black-forest-labs/FLUX.1-dev"
221
+ # Load configuration for each component
222
+ transformer_config = FluxTransformer2DModel.load_config(ckpt_id, subfolder="transformer")
223
+ vae_config = AutoencoderKL.load_config(ckpt_id, subfolder="vae")
224
+ text_encoder_config = CLIPTextConfig.from_pretrained(ckpt_id, subfolder="text_encoder")
225
+ text_encoder_2_config = T5Config.from_pretrained(ckpt_id, subfolder="text_encoder_2")
226
+
227
+ # Initialize components from their configurations
228
+ self.transformer = FluxTransformer2DModel.from_config(transformer_config)
229
+ self.vae = AutoencoderKL.from_config(vae_config)
230
+ self.text_encoder = CLIPTextModel(text_encoder_config)
231
+ self.text_encoder_2 = T5EncoderModel(text_encoder_2_config)
232
+
233
+ # Initialize tokenizers (these don't use from_config as they are not models)
234
+ self.tokenizer = CLIPTokenizer.from_pretrained(ckpt_id, subfolder="tokenizer")
235
+ self.tokenizer_2 = T5TokenizerFast.from_pretrained(ckpt_id, subfolder="tokenizer_2")
236
+
237
+ self.scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(ckpt_id, subfolder="scheduler")
238
+
239
+ # Create the pipeline configuration dictionary
240
+ pipeline_config = {
241
+ "transformer": self.transformer,
242
+ "scheduler": self.scheduler,
243
+ "vae": self.vae,
244
+ "text_encoder": self.text_encoder,
245
+ "text_encoder_2": self.text_encoder_2,
246
+ "tokenizer": self.tokenizer,
247
+ "tokenizer_2": self.tokenizer_2,
248
+ }
249
+
250
+ self.diffusion_expert = FluxPipeline(**pipeline_config)
251
+
252
+
253
+ def initialize_diffusion_expert(self, fsdp=None):
254
+
255
+ if getattr(self, 'diffusion_expert', None) is None:
256
+ print("Randomly initializing the diffusion expert!!!")
257
+ self.diffusion_expert = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", revision="main", torch_dtype=torch.bfloat16).to(torch.bfloat16)
258
+ self.text_encoder = self.diffusion_expert.text_encoder
259
+ self.text_encoder_2 = self.diffusion_expert.text_encoder_2
260
+ self.tokenizer = self.diffusion_expert.tokenizer
261
+ self.tokenizer_2 = self.diffusion_expert.tokenizer_2
262
+ self.vae = self.diffusion_expert.vae
263
+ self.transformer = self.diffusion_expert.transformer
264
+ self.scheduler = self.diffusion_expert.scheduler
265
+
266
+ self.config.diffusion_expert = "flux"
267
+
268
+
269
+
270
+ class QwenFluxConfig(Qwen2_5_VLConfig):
271
+ model_type = "QwenFlux"
272
+
273
+
274
+ class QwenFluxModel(QwenFluxMetaModel, Qwen2_5_VLModel):
275
+ config_class = QwenFluxConfig
276
+
277
+ def __init__(self, config: Qwen2_5_VLConfig):
278
+ super(QwenFluxModel, self).__init__(config)
279
+
280
+
281
+ class QwenFluxForInferenceLM(Qwen2_5_VLForConditionalGeneration):
282
+ config_class = QwenFluxConfig
283
+
284
+ def __init__(self, config):
285
+ Qwen2_5_VLForConditionalGeneration.__init__(self, config)
286
+ config.model_type = "QwenFlux"
287
+
288
+ self.model = QwenFluxModel(config)
289
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
290
+ self.post_init()
291
+
292
+ def get_model(self):
293
+ return self.model
294
+
295
+ @torch.no_grad()
296
+ def generate_image(
297
+ self,
298
+ texts: List[str],
299
+ diffusion_kwargs: Optional[Dict] = dict(guidance_scale = 3.5, num_inference_steps=25),
300
+ sde_sampling: Optional[bool] = False,
301
+ ):
302
+
303
+ if isinstance(texts, str):
304
+ texts = [texts]
305
+
306
+ if not sde_sampling:
307
+ output_img = self.model.diffusion_expert(
308
+ texts,
309
+ max_sequence_length=512,
310
+ **diffusion_kwargs,
311
+ ).images
312
+ return output_img
313
+ else:
314
+ return self.model.diffusion_expert.sde_sampling(
315
+ texts,
316
+ max_sequence_length=512,
317
+ **diffusion_kwargs,
318
+ )
319
+
320
+
321
+ def extract_thinking_content(self, text: str) -> str:
322
+ pattern = r'<answer>(.*?)</answer>'
323
+ matches = re.findall(pattern, text, re.DOTALL)
324
+
325
+ if matches:
326
+ return matches[-1].strip().replace("<answer>", "").replace("</answer>", "")
327
+ else:
328
+ return text.strip().replace("<answer>", "").replace("</answer>", "")
329
+
330
+ @torch.no_grad()
331
+ def generate_image_cot(
332
+ self,
333
+ texts: List[str],
334
+ processor: Optional[object] = None,
335
+ diffusion_kwargs: Optional[Dict] = dict(guidance_scale = 3.5, num_inference_steps=25),
336
+ llm_kwargs: Optional[Dict] = dict(max_new_tokens=256, temperature=0.7, top_p=0.9, do_sample=True),
337
+ cot_prompt_template: Optional[str] = None,
338
+ ):
339
+
340
+ if isinstance(texts, str):
341
+ texts = [texts]
342
+
343
+ if cot_prompt_template is None:
344
+ # cot_prompt_template = """Please improve the following image generation prompt to make it more detailed and specific for better image quality. Think step by step about what visual elements would make this image more compelling. Original prompt: {original_prompt}. Please provide the improved prompt in <thinking> </thinking> tags."""
345
+ cot_prompt_template = """Please provide an enhanced prompt for the following image generation prompt to make the image more realistic, detailed, with clear separation and precise alignment of all entities.
346
+ Original prompt: {original_prompt}. Directly provide the improved prompt in <answer> </answer> tags."""
347
+
348
+ improved_prompts = []
349
+
350
+ for text in texts:
351
+ cot_input = cot_prompt_template.format(original_prompt=text)
352
+
353
+ messages = [{"role": "user", "content": cot_input}]
354
+ input_text_formatted = processor.apply_chat_template(
355
+ messages, tokenize=False, add_generation_prompt=True
356
+ )
357
+ model_inputs = processor(
358
+ text=[input_text_formatted],
359
+ return_tensors="pt"
360
+ ).to(self.device)
361
+
362
+ generated_ids = self.generate(
363
+ **model_inputs,
364
+ **llm_kwargs,
365
+ eos_token_id=processor.tokenizer.eos_token_id,
366
+ pad_token_id=processor.tokenizer.pad_token_id
367
+ )
368
+
369
+ generated_text = processor.batch_decode(
370
+ generated_ids[:, model_inputs['input_ids'].shape[1]:],
371
+ skip_special_tokens=True
372
+ )
373
+
374
+ improved_prompt = [self.extract_thinking_content(decode_text) for decode_text in generated_text]
375
+ improved_prompts.extend(improved_prompt)
376
+
377
+ print(f"Original prompt: {text}")
378
+ print(f"Improved prompt: {improved_prompt}")
379
+ print("-" * 50)
380
+
381
+ output_images = self.generate_image(improved_prompts, diffusion_kwargs)
382
+
383
+ return {
384
+ 'images': output_images,
385
+ 'original_prompts': texts,
386
+ 'improved_prompts': improved_prompts
387
+ }
388
+
389
+ AutoConfig.register("QwenFlux", QwenFluxConfig)
390
+ AutoModelForCausalLM.register(QwenFluxConfig, QwenFluxForInferenceLM)
391
+
392
+
393
+ if __name__ == "__main__":
394
+
395
+ model = QwenFluxForInferenceLM.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct", torch_dtype=torch.bfloat16)
396
+ model.model.initialize_diffusion_expert()
397
+ model.model.diffusion_expert.to("cuda:0")
398
+ model.to("cuda:0")
399
+ AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
400
+ text = ["a photo of a cat"]
401
+ images = model.generate_image(text)
402
+ images[0].save("test_flux.png")
403
+
404
+ model.save_pretrained("outputs/pretrain/qwenflux")
405
+
406
+
407
+ model = QwenFluxForInferenceLM.from_pretrained("outputs/pretrain/qwenflux", torch_dtype=torch.bfloat16)
408
+ model.to("cuda:0")
409
+ processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
410
+ text = ["a photo of a cat"]
411
+ images = model.generate_image(text)
412
+ images[0].save("test_flux.jpg")
413
+
414
+ outputs = model.generate_image_cot(text, processor = processor)
415
+ outputs['images'][0].save("test_flux_cot.jpg")
416
+
417
+
418
+
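
An illustrative sketch (not part of the commit) of consuming the SDE trajectory through the wrapper; the checkpoint path mirrors the `__main__` block above and the module path follows this file's location:

```py
# Hedged sketch: collect per-step latents and log-probabilities via generate_image(..., sde_sampling=True).
import torch
from unimodel.qwenflux.qwenflux_inference import QwenFluxForInferenceLM

model = QwenFluxForInferenceLM.from_pretrained("outputs/pretrain/qwenflux", torch_dtype=torch.bfloat16).to("cuda:0")
image, prev_latents, log_probs, pred_latents, ts, states = model.generate_image(
    ["a photo of a cat"],
    diffusion_kwargs=dict(guidance_scale=3.5, num_inference_steps=10),
    sde_sampling=True,
)
# prev_latents, pred_latents, log_probs and ts are flattened to (batch * num_steps, ...),
# so each denoising transition can later be paired with a reward signal.
```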
unimodel/qwenkontext/fluxkontext_pipeline.py ADDED
@@ -0,0 +1,1161 @@
1
+ # Copyright 2025 Black Forest Labs and The HuggingFace Team. All rights reserved.
2
+ # Copyright 2025 Fu-Yun Wang
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import inspect
17
+ from typing import Any, Callable, Dict, List, Optional, Union
18
+
19
+ import numpy as np
20
+ import torch
21
+ from transformers import (
22
+ CLIPImageProcessor,
23
+ CLIPTextModel,
24
+ CLIPTokenizer,
25
+ CLIPVisionModelWithProjection,
26
+ T5EncoderModel,
27
+ T5TokenizerFast,
28
+ )
29
+
30
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
31
+ from diffusers.loaders import FluxIPAdapterMixin, FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin
32
+ from diffusers.models import AutoencoderKL, FluxTransformer2DModel
33
+ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
34
+ from diffusers.utils import (
35
+ USE_PEFT_BACKEND,
36
+ deprecate,
37
+ is_torch_xla_available,
38
+ logging,
39
+ replace_example_docstring,
40
+ scale_lora_layers,
41
+ unscale_lora_layers,
42
+ )
43
+ from diffusers.utils.torch_utils import randn_tensor
44
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
45
+ from diffusers.pipelines.flux.pipeline_output import FluxPipelineOutput
46
+
47
+
48
+ if is_torch_xla_available():
49
+ import torch_xla.core.xla_model as xm
50
+
51
+ XLA_AVAILABLE = True
52
+ else:
53
+ XLA_AVAILABLE = False
54
+
55
+
56
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
57
+
58
+ EXAMPLE_DOC_STRING = """
59
+ Examples:
60
+ ```py
61
+ >>> import torch
62
+ >>> from diffusers import FluxKontextPipeline
63
+ >>> from diffusers.utils import load_image
64
+
65
+ >>> pipe = FluxKontextPipeline.from_pretrained(
66
+ ... "black-forest-labs/FLUX.1-Kontext-dev", torch_dtype=torch.bfloat16
67
+ ... )
68
+ >>> pipe.to("cuda")
69
+
70
+ >>> image = load_image(
71
+ ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/yarn-art-pikachu.png"
72
+ ... ).convert("RGB")
73
+ >>> prompt = "Make Pikachu hold a sign that says 'Black Forest Labs is awesome', yarn art style, detailed, vibrant colors"
74
+ >>> image = pipe(
75
+ ... image=image,
76
+ ... prompt=prompt,
77
+ ... guidance_scale=2.5,
78
+ ... generator=torch.Generator().manual_seed(42),
79
+ ... ).images[0]
80
+ >>> image.save("output.png")
81
+ ```
82
+ """
83
+
84
+ PREFERRED_KONTEXT_RESOLUTIONS = [
85
+ (672, 1568),
86
+ (688, 1504),
87
+ (720, 1456),
88
+ (752, 1392),
89
+ (800, 1328),
90
+ (832, 1248),
91
+ (880, 1184),
92
+ (944, 1104),
93
+ (1024, 1024),
94
+ (1104, 944),
95
+ (1184, 880),
96
+ (1248, 832),
97
+ (1328, 800),
98
+ (1392, 752),
99
+ (1456, 720),
100
+ (1504, 688),
101
+ (1568, 672),
102
+ ]
103
+
104
+
105
+ def calculate_shift(
106
+ image_seq_len,
107
+ base_seq_len: int = 256,
108
+ max_seq_len: int = 4096,
109
+ base_shift: float = 0.5,
110
+ max_shift: float = 1.15,
111
+ ):
112
+ m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
113
+ b = base_shift - m * base_seq_len
114
+ mu = image_seq_len * m + b
115
+ return mu
116
+
117
+
118
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
119
+ def retrieve_timesteps(
120
+ scheduler,
121
+ num_inference_steps: Optional[int] = None,
122
+ device: Optional[Union[str, torch.device]] = None,
123
+ timesteps: Optional[List[int]] = None,
124
+ sigmas: Optional[List[float]] = None,
125
+ **kwargs,
126
+ ):
127
+ r"""
128
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
129
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
130
+
131
+ Args:
132
+ scheduler (`SchedulerMixin`):
133
+ The scheduler to get timesteps from.
134
+ num_inference_steps (`int`):
135
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
136
+ must be `None`.
137
+ device (`str` or `torch.device`, *optional*):
138
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
139
+ timesteps (`List[int]`, *optional*):
140
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
141
+ `num_inference_steps` and `sigmas` must be `None`.
142
+ sigmas (`List[float]`, *optional*):
143
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
144
+ `num_inference_steps` and `timesteps` must be `None`.
145
+
146
+ Returns:
147
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
148
+ second element is the number of inference steps.
149
+ """
150
+ if timesteps is not None and sigmas is not None:
151
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
152
+ if timesteps is not None:
153
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
154
+ if not accepts_timesteps:
155
+ raise ValueError(
156
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
157
+ f" timestep schedules. Please check whether you are using the correct scheduler."
158
+ )
159
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
160
+ timesteps = scheduler.timesteps
161
+ num_inference_steps = len(timesteps)
162
+ elif sigmas is not None:
163
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
164
+ if not accept_sigmas:
165
+ raise ValueError(
166
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
167
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
168
+ )
169
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
170
+ timesteps = scheduler.timesteps
171
+ num_inference_steps = len(timesteps)
172
+ else:
173
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
174
+ timesteps = scheduler.timesteps
175
+ return timesteps, num_inference_steps
176
+
177
+
178
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
179
+ def retrieve_latents(
180
+ encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
181
+ ):
182
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
183
+ return encoder_output.latent_dist.sample(generator)
184
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
185
+ return encoder_output.latent_dist.mode()
186
+ elif hasattr(encoder_output, "latents"):
187
+ return encoder_output.latents
188
+ else:
189
+ raise AttributeError("Could not access latents of provided encoder_output")
190
+
191
+
192
+ class FluxKontextPipeline(
193
+ DiffusionPipeline,
194
+ FluxLoraLoaderMixin,
195
+ FromSingleFileMixin,
196
+ TextualInversionLoaderMixin,
197
+ FluxIPAdapterMixin,
198
+ ):
199
+ r"""
200
+ The Flux Kontext pipeline for image-to-image and text-to-image generation.
201
+
202
+ Reference: https://bfl.ai/announcements/flux-1-kontext-dev
203
+
204
+ Args:
205
+ transformer ([`FluxTransformer2DModel`]):
206
+ Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
207
+ scheduler ([`FlowMatchEulerDiscreteScheduler`]):
208
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
209
+ vae ([`AutoencoderKL`]):
210
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
211
+ text_encoder ([`CLIPTextModel`]):
212
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
213
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
214
+ text_encoder_2 ([`T5EncoderModel`]):
215
+ [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
216
+ the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
217
+ tokenizer (`CLIPTokenizer`):
218
+ Tokenizer of class
219
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
220
+ tokenizer_2 (`T5TokenizerFast`):
221
+ Second Tokenizer of class
222
+ [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
223
+ """
224
+
225
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->transformer->vae"
226
+ _optional_components = ["image_encoder", "feature_extractor"]
227
+ _callback_tensor_inputs = ["latents", "prompt_embeds"]
228
+
229
+ def __init__(
230
+ self,
231
+ scheduler: FlowMatchEulerDiscreteScheduler,
232
+ vae: AutoencoderKL,
233
+ text_encoder: CLIPTextModel,
234
+ tokenizer: CLIPTokenizer,
235
+ text_encoder_2: T5EncoderModel,
236
+ tokenizer_2: T5TokenizerFast,
237
+ transformer: FluxTransformer2DModel,
238
+ image_encoder: CLIPVisionModelWithProjection = None,
239
+ feature_extractor: CLIPImageProcessor = None,
240
+ ):
241
+ super().__init__()
242
+
243
+ self.register_modules(
244
+ vae=vae,
245
+ text_encoder=text_encoder,
246
+ text_encoder_2=text_encoder_2,
247
+ tokenizer=tokenizer,
248
+ tokenizer_2=tokenizer_2,
249
+ transformer=transformer,
250
+ scheduler=scheduler,
251
+ image_encoder=image_encoder,
252
+ feature_extractor=feature_extractor,
253
+ )
254
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
255
+ # Flux latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
256
+ # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
257
+ self.latent_channels = self.vae.config.latent_channels if getattr(self, "vae", None) else 16
258
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
259
+ self.tokenizer_max_length = (
260
+ self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
261
+ )
262
+ self.default_sample_size = 128
263
+
264
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_t5_prompt_embeds
265
+ def _get_t5_prompt_embeds(
266
+ self,
267
+ prompt: Union[str, List[str]] = None,
268
+ num_images_per_prompt: int = 1,
269
+ max_sequence_length: int = 512,
270
+ device: Optional[torch.device] = None,
271
+ dtype: Optional[torch.dtype] = None,
272
+ ):
273
+ device = device or self._execution_device
274
+ dtype = dtype or self.text_encoder.dtype
275
+
276
+ prompt = [prompt] if isinstance(prompt, str) else prompt
277
+ batch_size = len(prompt)
278
+
279
+ if isinstance(self, TextualInversionLoaderMixin):
280
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer_2)
281
+
282
+ text_inputs = self.tokenizer_2(
283
+ prompt,
284
+ padding="max_length",
285
+ max_length=max_sequence_length,
286
+ truncation=True,
287
+ return_length=False,
288
+ return_overflowing_tokens=False,
289
+ return_tensors="pt",
290
+ )
291
+ text_input_ids = text_inputs.input_ids
292
+ untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids
293
+
294
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
295
+ removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
296
+ logger.warning(
297
+ "The following part of your input was truncated because `max_sequence_length` is set to "
298
+ f" {max_sequence_length} tokens: {removed_text}"
299
+ )
300
+
301
+ prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]
302
+
303
+ dtype = self.text_encoder_2.dtype
304
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
305
+
306
+ _, seq_len, _ = prompt_embeds.shape
307
+
308
+ # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
309
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
310
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
311
+
312
+ return prompt_embeds
313
+
314
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._get_clip_prompt_embeds
315
+ def _get_clip_prompt_embeds(
316
+ self,
317
+ prompt: Union[str, List[str]],
318
+ num_images_per_prompt: int = 1,
319
+ device: Optional[torch.device] = None,
320
+ ):
321
+ device = device or self._execution_device
322
+
323
+ prompt = [prompt] if isinstance(prompt, str) else prompt
324
+ batch_size = len(prompt)
325
+
326
+ if isinstance(self, TextualInversionLoaderMixin):
327
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
328
+
329
+ text_inputs = self.tokenizer(
330
+ prompt,
331
+ padding="max_length",
332
+ max_length=self.tokenizer_max_length,
333
+ truncation=True,
334
+ return_overflowing_tokens=False,
335
+ return_length=False,
336
+ return_tensors="pt",
337
+ )
338
+
339
+ text_input_ids = text_inputs.input_ids
340
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
341
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
342
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
343
+ logger.warning(
344
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
345
+ f" {self.tokenizer_max_length} tokens: {removed_text}"
346
+ )
347
+ prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False)
348
+
349
+ # Use pooled output of CLIPTextModel
350
+ prompt_embeds = prompt_embeds.pooler_output
351
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
352
+
353
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
354
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
355
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
356
+
357
+ return prompt_embeds
358
+
359
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.encode_prompt
360
+ def encode_prompt(
361
+ self,
362
+ prompt: Union[str, List[str]],
363
+ prompt_2: Optional[Union[str, List[str]]] = None,
364
+ device: Optional[torch.device] = None,
365
+ num_images_per_prompt: int = 1,
366
+ prompt_embeds: Optional[torch.FloatTensor] = None,
367
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
368
+ max_sequence_length: int = 512,
369
+ lora_scale: Optional[float] = None,
370
+ ):
371
+ r"""
372
+
373
+ Args:
374
+ prompt (`str` or `List[str]`, *optional*):
375
+ prompt to be encoded
376
+ prompt_2 (`str` or `List[str]`, *optional*):
377
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
378
+ used in all text-encoders
379
+ device: (`torch.device`):
380
+ torch device
381
+ num_images_per_prompt (`int`):
382
+ number of images that should be generated per prompt
383
+ prompt_embeds (`torch.FloatTensor`, *optional*):
384
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
385
+ provided, text embeddings will be generated from `prompt` input argument.
386
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
387
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
388
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
389
+ lora_scale (`float`, *optional*):
390
+ A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
391
+ """
392
+ device = device or self._execution_device
393
+
394
+ # set lora scale so that monkey patched LoRA
395
+ # function of text encoder can correctly access it
396
+ if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
397
+ self._lora_scale = lora_scale
398
+
399
+ # dynamically adjust the LoRA scale
400
+ if self.text_encoder is not None and USE_PEFT_BACKEND:
401
+ scale_lora_layers(self.text_encoder, lora_scale)
402
+ if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
403
+ scale_lora_layers(self.text_encoder_2, lora_scale)
404
+
405
+ prompt = [prompt] if isinstance(prompt, str) else prompt
406
+
407
+ if prompt_embeds is None:
408
+ prompt_2 = prompt_2 or prompt
409
+ prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
410
+
411
+ # We only use the pooled prompt output from the CLIPTextModel
412
+ pooled_prompt_embeds = self._get_clip_prompt_embeds(
413
+ prompt=prompt,
414
+ device=device,
415
+ num_images_per_prompt=num_images_per_prompt,
416
+ )
417
+ prompt_embeds = self._get_t5_prompt_embeds(
418
+ prompt=prompt_2,
419
+ num_images_per_prompt=num_images_per_prompt,
420
+ max_sequence_length=max_sequence_length,
421
+ device=device,
422
+ )
423
+
424
+ if self.text_encoder is not None:
425
+ if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
426
+ # Retrieve the original scale by scaling back the LoRA layers
427
+ unscale_lora_layers(self.text_encoder, lora_scale)
428
+
429
+ if self.text_encoder_2 is not None:
430
+ if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
431
+ # Retrieve the original scale by scaling back the LoRA layers
432
+ unscale_lora_layers(self.text_encoder_2, lora_scale)
433
+
434
+ dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
435
+ text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
436
+
437
+ return prompt_embeds, pooled_prompt_embeds, text_ids
438
+
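+ # --- Editor's sketch (illustrative, not part of the original file): embeddings returned by
+ # `encode_prompt` can be precomputed once and fed back into `__call__` via `prompt_embeds` /
+ # `pooled_prompt_embeds`. `pipe` and `init_image` are assumed to exist.
+ #
+ #     prompt_embeds, pooled_prompt_embeds, text_ids = pipe.encode_prompt(
+ #         prompt="replace the sky with a starry night", prompt_2=None, max_sequence_length=512
+ #     )
+ #     out = pipe(image=init_image, prompt_embeds=prompt_embeds,
+ #                pooled_prompt_embeds=pooled_prompt_embeds)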
439
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.encode_image
440
+ def encode_image(self, image, device, num_images_per_prompt):
441
+ dtype = next(self.image_encoder.parameters()).dtype
442
+
443
+ if not isinstance(image, torch.Tensor):
444
+ image = self.feature_extractor(image, return_tensors="pt").pixel_values
445
+
446
+ image = image.to(device=device, dtype=dtype)
447
+ image_embeds = self.image_encoder(image).image_embeds
448
+ image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
449
+ return image_embeds
450
+
451
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.prepare_ip_adapter_image_embeds
452
+ def prepare_ip_adapter_image_embeds(
453
+ self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt
454
+ ):
455
+ image_embeds = []
456
+ if ip_adapter_image_embeds is None:
457
+ if not isinstance(ip_adapter_image, list):
458
+ ip_adapter_image = [ip_adapter_image]
459
+
460
+ if len(ip_adapter_image) != self.transformer.encoder_hid_proj.num_ip_adapters:
461
+ raise ValueError(
462
+ f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {self.transformer.encoder_hid_proj.num_ip_adapters} IP Adapters."
463
+ )
464
+
465
+ for single_ip_adapter_image in ip_adapter_image:
466
+ single_image_embeds = self.encode_image(single_ip_adapter_image, device, 1)
467
+ image_embeds.append(single_image_embeds[None, :])
468
+ else:
469
+ if not isinstance(ip_adapter_image_embeds, list):
470
+ ip_adapter_image_embeds = [ip_adapter_image_embeds]
471
+
472
+ if len(ip_adapter_image_embeds) != self.transformer.encoder_hid_proj.num_ip_adapters:
473
+ raise ValueError(
474
+ f"`ip_adapter_image_embeds` must have same length as the number of IP Adapters. Got {len(ip_adapter_image_embeds)} image embeds and {self.transformer.encoder_hid_proj.num_ip_adapters} IP Adapters."
475
+ )
476
+
477
+ for single_image_embeds in ip_adapter_image_embeds:
478
+ image_embeds.append(single_image_embeds)
479
+
480
+ ip_adapter_image_embeds = []
481
+ for single_image_embeds in image_embeds:
482
+ single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
483
+ single_image_embeds = single_image_embeds.to(device=device)
484
+ ip_adapter_image_embeds.append(single_image_embeds)
485
+
486
+ return ip_adapter_image_embeds
487
+
488
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.check_inputs
489
+ def check_inputs(
490
+ self,
491
+ prompt,
492
+ prompt_2,
493
+ height,
494
+ width,
495
+ negative_prompt=None,
496
+ negative_prompt_2=None,
497
+ prompt_embeds=None,
498
+ negative_prompt_embeds=None,
499
+ pooled_prompt_embeds=None,
500
+ negative_pooled_prompt_embeds=None,
501
+ callback_on_step_end_tensor_inputs=None,
502
+ max_sequence_length=None,
503
+ ):
504
+ if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
505
+ logger.warning(
506
+ f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
507
+ )
508
+
509
+ if callback_on_step_end_tensor_inputs is not None and not all(
510
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
511
+ ):
512
+ raise ValueError(
513
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
514
+ )
515
+
516
+ if prompt is not None and prompt_embeds is not None:
517
+ raise ValueError(
518
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
519
+ " only forward one of the two."
520
+ )
521
+ elif prompt_2 is not None and prompt_embeds is not None:
522
+ raise ValueError(
523
+ f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
524
+ " only forward one of the two."
525
+ )
526
+ elif prompt is None and prompt_embeds is None:
527
+ raise ValueError(
528
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
529
+ )
530
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
531
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
532
+ elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
533
+ raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
534
+
535
+ if negative_prompt is not None and negative_prompt_embeds is not None:
536
+ raise ValueError(
537
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
538
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
539
+ )
540
+ elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
541
+ raise ValueError(
542
+ f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
543
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
544
+ )
545
+
546
+ if prompt_embeds is not None and pooled_prompt_embeds is None:
547
+ raise ValueError(
548
+ "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
549
+ )
550
+ if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
551
+ raise ValueError(
552
+ "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
553
+ )
554
+
555
+ if max_sequence_length is not None and max_sequence_length > 512:
556
+ raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
557
+
558
+ @staticmethod
559
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._prepare_latent_image_ids
560
+ def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
561
+ latent_image_ids = torch.zeros(height, width, 3)
562
+ latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[:, None]
563
+ latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, :]
564
+
565
+ latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
566
+
567
+ latent_image_ids = latent_image_ids.reshape(
568
+ latent_image_id_height * latent_image_id_width, latent_image_id_channels
569
+ )
570
+
571
+ return latent_image_ids.to(device=device, dtype=dtype)
572
+
573
+ @staticmethod
574
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._pack_latents
575
+ def _pack_latents(latents, batch_size, num_channels_latents, height, width):
576
+ latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
577
+ latents = latents.permute(0, 2, 4, 1, 3, 5)
578
+ latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
579
+
580
+ return latents
581
+
582
+ @staticmethod
583
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline._unpack_latents
584
+ def _unpack_latents(latents, height, width, vae_scale_factor):
585
+ batch_size, num_patches, channels = latents.shape
586
+
587
+ # VAE applies 8x compression on images but we must also account for packing which requires
588
+ # latent height and width to be divisible by 2.
589
+ height = 2 * (int(height) // (vae_scale_factor * 2))
590
+ width = 2 * (int(width) // (vae_scale_factor * 2))
591
+
592
+ latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
593
+ latents = latents.permute(0, 3, 1, 4, 2, 5)
594
+
595
+ latents = latents.reshape(batch_size, channels // (2 * 2), height, width)
596
+
597
+ return latents
598
+
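+ # --- Editor's sketch (illustrative, not part of the original file): `_pack_latents` and
+ # `_unpack_latents` are inverse reshapes between the (B, C, H, W) latent grid and the
+ # (B, H/2 * W/2, 4C) token sequence the transformer consumes. For a 1024x1024 image with
+ # vae_scale_factor=8 the latent grid is 128x128, i.e. 4096 tokens of width 64:
+ #
+ #     lat = torch.randn(1, 16, 128, 128)
+ #     packed = FluxKontextPipeline._pack_latents(lat, 1, 16, 128, 128)              # (1, 4096, 64)
+ #     restored = FluxKontextPipeline._unpack_latents(packed, 1024, 1024, vae_scale_factor=8)
+ #     assert torch.equal(lat, restored)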
599
+ def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
600
+ if isinstance(generator, list):
601
+ image_latents = [
602
+ retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i], sample_mode="argmax")
603
+ for i in range(image.shape[0])
604
+ ]
605
+ image_latents = torch.cat(image_latents, dim=0)
606
+ else:
607
+ image_latents = retrieve_latents(self.vae.encode(image), generator=generator, sample_mode="argmax")
608
+
609
+ image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
610
+
611
+ return image_latents
612
+
613
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.enable_vae_slicing
614
+ def enable_vae_slicing(self):
615
+ r"""
616
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
617
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
618
+ """
619
+ depr_message = f"Calling `enable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_slicing()`."
620
+ deprecate(
621
+ "enable_vae_slicing",
622
+ "0.40.0",
623
+ depr_message,
624
+ )
625
+ self.vae.enable_slicing()
626
+
627
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.disable_vae_slicing
628
+ def disable_vae_slicing(self):
629
+ r"""
630
+ Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
631
+ computing decoding in one step.
632
+ """
633
+ depr_message = f"Calling `disable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_slicing()`."
634
+ deprecate(
635
+ "disable_vae_slicing",
636
+ "0.40.0",
637
+ depr_message,
638
+ )
639
+ self.vae.disable_slicing()
640
+
641
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.enable_vae_tiling
642
+ def enable_vae_tiling(self):
643
+ r"""
644
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
645
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
646
+ processing larger images.
647
+ """
648
+ depr_message = f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_tiling()`."
649
+ deprecate(
650
+ "enable_vae_tiling",
651
+ "0.40.0",
652
+ depr_message,
653
+ )
654
+ self.vae.enable_tiling()
655
+
656
+ # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.disable_vae_tiling
657
+ def disable_vae_tiling(self):
658
+ r"""
659
+ Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
660
+ computing decoding in one step.
661
+ """
662
+ depr_message = f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_tiling()`."
663
+ deprecate(
664
+ "disable_vae_tiling",
665
+ "0.40.0",
666
+ depr_message,
667
+ )
668
+ self.vae.disable_tiling()
669
+
670
+ def prepare_latents(
671
+ self,
672
+ image: Optional[torch.Tensor],
673
+ batch_size: int,
674
+ num_channels_latents: int,
675
+ height: int,
676
+ width: int,
677
+ dtype: torch.dtype,
678
+ device: torch.device,
679
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
680
+ latents: Optional[torch.Tensor] = None,
681
+ ):
682
+ if isinstance(generator, list) and len(generator) != batch_size:
683
+ raise ValueError(
684
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
685
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
686
+ )
687
+
688
+ # VAE applies 8x compression on images but we must also account for packing which requires
689
+ # latent height and width to be divisible by 2.
690
+ height = 2 * (int(height) // (self.vae_scale_factor * 2))
691
+ width = 2 * (int(width) // (self.vae_scale_factor * 2))
692
+ shape = (batch_size, num_channels_latents, height, width)
693
+
694
+ image_latents = image_ids = None
695
+ if image is not None:
696
+ image = image.to(device=device, dtype=dtype)
697
+ if image.shape[1] != self.latent_channels:
698
+ image_latents = self._encode_vae_image(image=image, generator=generator)
699
+ else:
700
+ image_latents = image
701
+ if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
702
+ # expand init_latents for batch_size
703
+ additional_image_per_prompt = batch_size // image_latents.shape[0]
704
+ image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
705
+ elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
706
+ raise ValueError(
707
+ f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
708
+ )
709
+ else:
710
+ image_latents = torch.cat([image_latents], dim=0)
711
+
712
+ image_latent_height, image_latent_width = image_latents.shape[2:]
713
+ image_latents = self._pack_latents(
714
+ image_latents, batch_size, num_channels_latents, image_latent_height, image_latent_width
715
+ )
716
+ image_ids = self._prepare_latent_image_ids(
717
+ batch_size, image_latent_height // 2, image_latent_width // 2, device, dtype
718
+ )
719
+ # image ids are the same as latent ids with the first dimension set to 1 instead of 0
720
+ image_ids[..., 0] = 1
721
+
722
+ latent_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
723
+
724
+ if latents is None:
725
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
726
+ latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
727
+ else:
728
+ latents = latents.to(device=device, dtype=dtype)
729
+
730
+ return latents, image_latents, latent_ids, image_ids
731
+
732
+ @property
733
+ def guidance_scale(self):
734
+ return self._guidance_scale
735
+
736
+ @property
737
+ def joint_attention_kwargs(self):
738
+ return self._joint_attention_kwargs
739
+
740
+ @property
741
+ def num_timesteps(self):
742
+ return self._num_timesteps
743
+
744
+ @property
745
+ def current_timestep(self):
746
+ return self._current_timestep
747
+
748
+ @property
749
+ def interrupt(self):
750
+ return self._interrupt
751
+
752
+ @torch.no_grad()
753
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
754
+ def __call__(
755
+ self,
756
+ image: Optional[PipelineImageInput] = None,
757
+ prompt: Union[str, List[str]] = None,
758
+ prompt_2: Optional[Union[str, List[str]]] = None,
759
+ negative_prompt: Union[str, List[str]] = None,
760
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
761
+ true_cfg_scale: float = 1.0,
762
+ height: Optional[int] = None,
763
+ width: Optional[int] = None,
764
+ num_inference_steps: int = 28,
765
+ sigmas: Optional[List[float]] = None,
766
+ guidance_scale: float = 3.5,
767
+ num_images_per_prompt: Optional[int] = 1,
768
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
769
+ latents: Optional[torch.FloatTensor] = None,
770
+ prompt_embeds: Optional[torch.FloatTensor] = None,
771
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
772
+ ip_adapter_image: Optional[PipelineImageInput] = None,
773
+ ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
774
+ negative_ip_adapter_image: Optional[PipelineImageInput] = None,
775
+ negative_ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
776
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
777
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
778
+ output_type: Optional[str] = "pil",
779
+ return_dict: bool = True,
780
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
781
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
782
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
783
+ max_sequence_length: int = 512,
784
+ max_area: int = 1024**2,
785
+ _auto_resize: bool = True,
786
+ ):
787
+ r"""
788
+ Function invoked when calling the pipeline for generation.
789
+
790
+ Args:
791
+ image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
792
+ `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both
793
+ numpy array and pytorch tensor, the expected value range is between `[0, 1]`. If it's a tensor or a list
794
+ of tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a
795
+ list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)`. It can also accept image
796
+ latents as `image`, but if passing latents directly it is not encoded again.
797
+ prompt (`str` or `List[str]`, *optional*):
798
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds` instead.
800
+ prompt_2 (`str` or `List[str]`, *optional*):
801
+ The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` will be used instead.
803
+ negative_prompt (`str` or `List[str]`, *optional*):
804
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
805
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is
806
+ not greater than `1`).
807
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
808
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
809
+ `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
810
+ true_cfg_scale (`float`, *optional*, defaults to 1.0):
811
+ When > 1.0 and a `negative_prompt` is provided, true classifier-free guidance is enabled.
812
+ height (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor):
813
+ The height in pixels of the generated image. This is set to 1024 by default for the best results.
814
+ width (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor):
815
+ The width in pixels of the generated image. This is set to 1024 by default for the best results.
816
+ num_inference_steps (`int`, *optional*, defaults to 28):
817
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
818
+ expense of slower inference.
819
+ sigmas (`List[float]`, *optional*):
820
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
821
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
822
+ will be used.
823
+ guidance_scale (`float`, *optional*, defaults to 3.5):
824
+ Embedded guidance scale is enabled by setting `guidance_scale` > 1. A higher `guidance_scale` encourages
825
+ the model to generate images more closely aligned with the prompt, at the expense of lower image quality.
826
+
827
+ Guidance-distilled models approximate true classifier-free guidance for `guidance_scale` > 1. Refer to
828
+ the [paper](https://huggingface.co/papers/2210.03142) to learn more.
829
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
830
+ The number of images to generate per prompt.
831
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
832
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
833
+ to make generation deterministic.
834
+ latents (`torch.FloatTensor`, *optional*):
835
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
836
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
837
+ tensor will be generated by sampling using the supplied random `generator`.
838
+ prompt_embeds (`torch.FloatTensor`, *optional*):
839
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
840
+ provided, text embeddings will be generated from `prompt` input argument.
841
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
842
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
843
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
844
+ ip_adapter_image (`PipelineImageInput`, *optional*):
845
+ Optional image input to work with IP Adapters.
846
+ ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
847
+ Pre-generated image embeddings for IP-Adapter. It should be a list whose length matches the number of
848
+ IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not
849
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
850
+ negative_ip_adapter_image (`PipelineImageInput`, *optional*):
851
+ Optional image input to work with IP Adapters.
852
+ negative_ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
853
+ Pre-generated image embeddings for IP-Adapter. It should be a list whose length matches the number of
854
+ IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. If not
855
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
856
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
857
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
858
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
859
+ argument.
860
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
861
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
862
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
863
+ input argument.
864
+ output_type (`str`, *optional*, defaults to `"pil"`):
865
+ The output format of the generated image. Choose between
866
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
867
+ return_dict (`bool`, *optional*, defaults to `True`):
868
+ Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
869
+ joint_attention_kwargs (`dict`, *optional*):
870
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
871
+ `self.processor` in
872
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
873
+ callback_on_step_end (`Callable`, *optional*):
874
+ A function that is called at the end of each denoising step during inference. The function is called
875
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
876
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
877
+ `callback_on_step_end_tensor_inputs`.
878
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
879
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
880
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
881
+ `._callback_tensor_inputs` attribute of your pipeline class.
882
+ max_sequence_length (`int` defaults to 512):
883
+ Maximum sequence length to use with the `prompt`.
884
+ max_area (`int`, defaults to `1024 ** 2`):
885
+ The maximum area of the generated image in pixels. The height and width will be adjusted to fit this
886
+ area while maintaining the aspect ratio.
887
+
888
+ Examples:
889
+
890
+ Returns:
891
+ [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
892
+ is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
893
+ images.
894
+ """
895
+
896
+ height = height or self.default_sample_size * self.vae_scale_factor
897
+ width = width or self.default_sample_size * self.vae_scale_factor
898
+
899
+ original_height, original_width = height, width
900
+ aspect_ratio = width / height
901
+
902
+ width = round((max_area * aspect_ratio) ** 0.5)
903
+ height = round((max_area / aspect_ratio) ** 0.5)
904
+
905
+ multiple_of = self.vae_scale_factor * 2
906
+ width = width // multiple_of * multiple_of
907
+ height = height // multiple_of * multiple_of
908
+
909
+ if height != original_height or width != original_width:
910
+ logger.warning(
911
+ f"Generation `height` and `width` have been adjusted to {height} and {width} to fit the model requirements."
912
+ )
913
+
914
+ # 1. Check inputs. Raise error if not correct
915
+ self.check_inputs(
916
+ prompt,
917
+ prompt_2,
918
+ height,
919
+ width,
920
+ negative_prompt=negative_prompt,
921
+ negative_prompt_2=negative_prompt_2,
922
+ prompt_embeds=prompt_embeds,
923
+ negative_prompt_embeds=negative_prompt_embeds,
924
+ pooled_prompt_embeds=pooled_prompt_embeds,
925
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
926
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
927
+ max_sequence_length=max_sequence_length,
928
+ )
929
+
930
+ self._guidance_scale = guidance_scale
931
+ self._joint_attention_kwargs = joint_attention_kwargs
932
+ self._current_timestep = None
933
+ self._interrupt = False
934
+
935
+ # 2. Define call parameters
936
+ if prompt is not None and isinstance(prompt, str):
937
+ batch_size = 1
938
+ elif prompt is not None and isinstance(prompt, list):
939
+ batch_size = len(prompt)
940
+ else:
941
+ batch_size = prompt_embeds.shape[0]
942
+
943
+ device = self._execution_device
944
+
945
+ lora_scale = (
946
+ self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
947
+ )
948
+ has_neg_prompt = negative_prompt is not None or (
949
+ negative_prompt_embeds is not None and negative_pooled_prompt_embeds is not None
950
+ )
951
+ do_true_cfg = true_cfg_scale > 1 and has_neg_prompt
952
+ (
953
+ prompt_embeds,
954
+ pooled_prompt_embeds,
955
+ text_ids,
956
+ ) = self.encode_prompt(
957
+ prompt=prompt,
958
+ prompt_2=prompt_2,
959
+ prompt_embeds=prompt_embeds,
960
+ pooled_prompt_embeds=pooled_prompt_embeds,
961
+ device=device,
962
+ num_images_per_prompt=num_images_per_prompt,
963
+ max_sequence_length=max_sequence_length,
964
+ lora_scale=lora_scale,
965
+ )
966
+ if do_true_cfg:
967
+ (
968
+ negative_prompt_embeds,
969
+ negative_pooled_prompt_embeds,
970
+ negative_text_ids,
971
+ ) = self.encode_prompt(
972
+ prompt=negative_prompt,
973
+ prompt_2=negative_prompt_2,
974
+ prompt_embeds=negative_prompt_embeds,
975
+ pooled_prompt_embeds=negative_pooled_prompt_embeds,
976
+ device=device,
977
+ num_images_per_prompt=num_images_per_prompt,
978
+ max_sequence_length=max_sequence_length,
979
+ lora_scale=lora_scale,
980
+ )
981
+
982
+ # 3. Preprocess image
983
+ if image is not None and not (isinstance(image, torch.Tensor) and image.size(1) == self.latent_channels):
984
+ img = image[0] if isinstance(image, list) else image
985
+ image_height, image_width = self.image_processor.get_default_height_width(img)
986
+ aspect_ratio = image_width / image_height
987
+ if _auto_resize:
988
+ # Kontext is trained on specific resolutions, using one of them is recommended
989
+ _, image_width, image_height = min(
990
+ (abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_KONTEXT_RESOLUTIONS
991
+ )
992
+ image_width = image_width // multiple_of * multiple_of
993
+ image_height = image_height // multiple_of * multiple_of
994
+ image = self.image_processor.resize(image, image_height, image_width)
995
+ image = self.image_processor.preprocess(image, image_height, image_width)
996
+
997
+ # 4. Prepare latent variables
998
+ num_channels_latents = self.transformer.config.in_channels // 4
999
+ latents, image_latents, latent_ids, image_ids = self.prepare_latents(
1000
+ image,
1001
+ batch_size * num_images_per_prompt,
1002
+ num_channels_latents,
1003
+ height,
1004
+ width,
1005
+ prompt_embeds.dtype,
1006
+ device,
1007
+ generator,
1008
+ latents,
1009
+ )
1010
+ if image_ids is not None:
1011
+ latent_ids = torch.cat([latent_ids, image_ids], dim=0) # dim 0 is sequence dimension
1012
+
1013
+ # 5. Prepare timesteps
1014
+ sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
1015
+ image_seq_len = latents.shape[1]
1016
+ mu = calculate_shift(
1017
+ image_seq_len,
1018
+ self.scheduler.config.get("base_image_seq_len", 256),
1019
+ self.scheduler.config.get("max_image_seq_len", 4096),
1020
+ self.scheduler.config.get("base_shift", 0.5),
1021
+ self.scheduler.config.get("max_shift", 1.15),
1022
+ )
1023
+ timesteps, num_inference_steps = retrieve_timesteps(
1024
+ self.scheduler,
1025
+ num_inference_steps,
1026
+ device,
1027
+ sigmas=sigmas,
1028
+ mu=mu,
1029
+ )
1030
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
1031
+ self._num_timesteps = len(timesteps)
1032
+
1033
+ # handle guidance
1034
+ if self.transformer.config.guidance_embeds:
1035
+ guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
1036
+ guidance = guidance.expand(latents.shape[0])
1037
+ else:
1038
+ guidance = None
1039
+
1040
+ if (ip_adapter_image is not None or ip_adapter_image_embeds is not None) and (
1041
+ negative_ip_adapter_image is None and negative_ip_adapter_image_embeds is None
1042
+ ):
1043
+ negative_ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
1044
+ negative_ip_adapter_image = [negative_ip_adapter_image] * self.transformer.encoder_hid_proj.num_ip_adapters
1045
+
1046
+ elif (ip_adapter_image is None and ip_adapter_image_embeds is None) and (
1047
+ negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None
1048
+ ):
1049
+ ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
1050
+ ip_adapter_image = [ip_adapter_image] * self.transformer.encoder_hid_proj.num_ip_adapters
1051
+
1052
+ if self.joint_attention_kwargs is None:
1053
+ self._joint_attention_kwargs = {}
1054
+
1055
+ image_embeds = None
1056
+ negative_image_embeds = None
1057
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
1058
+ image_embeds = self.prepare_ip_adapter_image_embeds(
1059
+ ip_adapter_image,
1060
+ ip_adapter_image_embeds,
1061
+ device,
1062
+ batch_size * num_images_per_prompt,
1063
+ )
1064
+ if negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None:
1065
+ negative_image_embeds = self.prepare_ip_adapter_image_embeds(
1066
+ negative_ip_adapter_image,
1067
+ negative_ip_adapter_image_embeds,
1068
+ device,
1069
+ batch_size * num_images_per_prompt,
1070
+ )
1071
+
1072
+ # 6. Denoising loop
1073
+ # We set the index here to remove DtoH sync, helpful especially during compilation.
1074
+ # Check out more details here: https://github.com/huggingface/diffusers/pull/11696
1075
+ self.scheduler.set_begin_index(0)
1076
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
1077
+ for i, t in enumerate(timesteps):
1078
+ if self.interrupt:
1079
+ continue
1080
+
1081
+ self._current_timestep = t
1082
+ if image_embeds is not None:
1083
+ self._joint_attention_kwargs["ip_adapter_image_embeds"] = image_embeds
1084
+
1085
+ latent_model_input = latents
1086
+ if image_latents is not None:
1087
+ latent_model_input = torch.cat([latents, image_latents], dim=1)
1088
+ timestep = t.expand(latents.shape[0]).to(latents.dtype)
1089
+
1090
+ noise_pred = self.transformer(
1091
+ hidden_states=latent_model_input,
1092
+ timestep=timestep / 1000,
1093
+ guidance=guidance,
1094
+ pooled_projections=pooled_prompt_embeds,
1095
+ encoder_hidden_states=prompt_embeds,
1096
+ txt_ids=text_ids,
1097
+ img_ids=latent_ids,
1098
+ joint_attention_kwargs=self.joint_attention_kwargs,
1099
+ return_dict=False,
1100
+ )[0]
1101
+ noise_pred = noise_pred[:, : latents.size(1)]
1102
+
1103
+ if do_true_cfg:
1104
+ if negative_image_embeds is not None:
1105
+ self._joint_attention_kwargs["ip_adapter_image_embeds"] = negative_image_embeds
1106
+ neg_noise_pred = self.transformer(
1107
+ hidden_states=latent_model_input,
1108
+ timestep=timestep / 1000,
1109
+ guidance=guidance,
1110
+ pooled_projections=negative_pooled_prompt_embeds,
1111
+ encoder_hidden_states=negative_prompt_embeds,
1112
+ txt_ids=negative_text_ids,
1113
+ img_ids=latent_ids,
1114
+ joint_attention_kwargs=self.joint_attention_kwargs,
1115
+ return_dict=False,
1116
+ )[0]
1117
+ neg_noise_pred = neg_noise_pred[:, : latents.size(1)]
1118
+ noise_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred)
1119
+
1120
+ # compute the previous noisy sample x_t -> x_t-1
1121
+ latents_dtype = latents.dtype
1122
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
1123
+
1124
+ if latents.dtype != latents_dtype:
1125
+ if torch.backends.mps.is_available():
1126
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
1127
+ latents = latents.to(latents_dtype)
1128
+
1129
+ if callback_on_step_end is not None:
1130
+ callback_kwargs = {}
1131
+ for k in callback_on_step_end_tensor_inputs:
1132
+ callback_kwargs[k] = locals()[k]
1133
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
1134
+
1135
+ latents = callback_outputs.pop("latents", latents)
1136
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1137
+
1138
+ # call the callback, if provided
1139
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
1140
+ progress_bar.update()
1141
+
1142
+ if XLA_AVAILABLE:
1143
+ xm.mark_step()
1144
+
1145
+ self._current_timestep = None
1146
+
1147
+ if output_type == "latent":
1148
+ image = latents
1149
+ else:
1150
+ latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
1151
+ latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
1152
+ image = self.vae.decode(latents, return_dict=False)[0]
1153
+ image = self.image_processor.postprocess(image, output_type=output_type)
1154
+
1155
+ # Offload all models
1156
+ self.maybe_free_model_hooks()
1157
+
1158
+ if not return_dict:
1159
+ return (image,)
1160
+
1161
+ return FluxPipelineOutput(images=image)
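+ # --- Editor's sketch (illustrative, not part of the original file): minimal end-to-end use of the
+ # pipeline defined above. The checkpoint id matches the one referenced later in this commit; the
+ # input image path is a placeholder.
+ #
+ #     pipe = FluxKontextPipeline.from_pretrained(
+ #         "black-forest-labs/FLUX.1-Kontext-dev", torch_dtype=torch.bfloat16
+ #     ).to("cuda")
+ #     init_image = PIL.Image.open("input.png").convert("RGB")
+ #     result = pipe(image=init_image, prompt="turn this photo into a watercolor painting",
+ #                   guidance_scale=3.5, num_inference_steps=28)
+ #     result.images[0].save("edited.png")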
unimodel/qwenkontext/qwenkontext_inference.py ADDED
@@ -0,0 +1,442 @@
1
+ # Copyright 2025 Fu-Yun Wang
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import List, Optional, Tuple, Union, Dict
16
+ import torch
17
+ import torch.nn as nn
18
+ from PIL import Image
19
+ import torch.nn.functional as F
20
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoProcessor
21
+ from transformers import Qwen2_5_VLConfig, Qwen2_5_VLModel, Qwen2_5_VLForConditionalGeneration
22
+ from qwen_vl_utils import process_vision_info
23
+ import torchvision.transforms as transforms
24
+
25
+
26
+ from diffusers.utils.torch_utils import randn_tensor
27
+ from diffusers.pipelines.pipeline_utils import numpy_to_pil
28
+ import numpy as np
29
+ from diffusers.schedulers.scheduling_flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler, FlowMatchEulerDiscreteSchedulerOutput
30
+ from diffusers.schedulers import DPMSolverMultistepScheduler
31
+ import math
32
+ from diffusers.utils.torch_utils import randn_tensor
33
+ from diffusers import FluxTransformer2DModel, AutoencoderKL, FlowMatchEulerDiscreteScheduler #, FluxKontextPipeline
34
+ from .fluxkontext_pipeline import FluxKontextPipeline
35
+ from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast, CLIPTextConfig, T5Config
36
+ import re
37
+ import datetime
38
+ import os
39
+
40
+
41
+ def save_grid_image(prompt, images, rows, cols):
42
+ timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
43
+ base_dir = os.path.join("samples", timestamp, prompt[:100])
44
+ os.makedirs(base_dir, exist_ok=True)
45
+
46
+ filename = os.path.join(base_dir, "grid.jpg")
47
+ grid_image = create_image_grid(images, rows, cols)
48
+ grid_image.save(filename)
49
+
50
+ print(f"Saved: {filename}")
51
+
52
+ def create_image_grid(images, rows, cols):
53
+ """Creates a grid of images and returns a single PIL Image."""
54
+
55
+ assert len(images) == rows * cols
56
+
57
+ width, height = images[0].size
58
+ grid_width = width * cols
59
+ grid_height = height * rows
60
+
61
+ grid_image = Image.new('RGB', (grid_width, grid_height))
62
+
63
+ for i, image in enumerate(images):
64
+ x = (i % cols) * width
65
+ y = (i // cols) * height
66
+ grid_image.paste(image, (x, y))
67
+
68
+ return grid_image
69
+
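+ # --- Editor's sketch (illustrative, not part of the original file): `create_image_grid` tiles PIL
+ # images row-major, and `save_grid_image` writes the grid under samples/<timestamp>/<prompt prefix>/.
+ #
+ #     imgs = [Image.new("RGB", (256, 256), c) for c in ("red", "green", "blue", "white")]
+ #     save_grid_image("demo prompt", imgs, rows=2, cols=2)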
70
+
71
+ def sde_step_with_logprob(
72
+ self: FlowMatchEulerDiscreteScheduler,
73
+ model_output: torch.FloatTensor,
74
+ timestep: Union[float, torch.FloatTensor],
75
+ sample: torch.FloatTensor,
76
+ prev_sample: Optional[torch.FloatTensor] = None,
77
+ generator: Optional[torch.Generator] = None,
78
+ ) -> Union[FlowMatchEulerDiscreteSchedulerOutput, Tuple]:
79
+ """
80
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the flow
81
+ process from the learned model outputs (most often the predicted velocity).
82
+
83
+ Args:
84
+ model_output (`torch.FloatTensor`):
85
+ The direct output from learned flow model.
86
+ timestep (`float`):
87
+ The current discrete timestep in the diffusion chain.
88
+ sample (`torch.FloatTensor`):
89
+ A current instance of a sample created by the diffusion process.
90
+ generator (`torch.Generator`, *optional*):
91
+ A random number generator.
92
+ """
93
+ step_index = [self.index_for_timestep(t) for t in timestep]
94
+ prev_step_index = [step+1 for step in step_index]
95
+ sigma = self.sigmas[step_index].view(-1, 1, 1).to(model_output.device)
96
+ sigma_prev = self.sigmas[prev_step_index].view(-1, 1, 1).to(model_output.device)
97
+ sigma_max = self.sigmas[1].item()
98
+ dt = sigma_prev - sigma
99
+
100
+ std_dev_t = torch.sqrt(sigma / (1 - torch.where(sigma == 1, sigma_max, sigma)))*1.0
101
+
102
+
103
+ # our sde
104
+ prev_sample_mean = sample*(1+std_dev_t**2/(2*sigma)*dt)+model_output*(1+std_dev_t**2*(1-sigma)/(2*sigma))*dt
105
+
106
+ if prev_sample is not None and generator is not None:
107
+ raise ValueError(
108
+ "Cannot pass both generator and prev_sample. Please make sure that either `generator` or"
109
+ " `prev_sample` stays `None`."
110
+ )
111
+
112
+ if prev_sample is None:
113
+ variance_noise = randn_tensor(
114
+ model_output.shape,
115
+ generator=generator,
116
+ device=model_output.device,
117
+ dtype=model_output.dtype,
118
+ )
119
+ prev_sample = prev_sample_mean + std_dev_t * torch.sqrt(-1*dt) * variance_noise
120
+
121
+
122
+ log_prob = (
123
+ -((prev_sample.detach() - prev_sample_mean) ** 2) / (2 * ((std_dev_t * torch.sqrt(-1*dt))**2))
124
+ - torch.log(std_dev_t * torch.sqrt(-1*dt))
125
+ - torch.log(torch.sqrt(2 * torch.as_tensor(math.pi)))
126
+ )
127
+
128
+ # mean along all but batch dimension
129
+ log_prob = log_prob.mean(dim=tuple(range(1, log_prob.ndim)))
130
+
131
+ return prev_sample, log_prob, prev_sample_mean, std_dev_t * torch.sqrt(-1*dt)
132
+
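+ # --- Editor's sketch (illustrative, not part of the original file): how this SDE step could replace
+ # the deterministic `scheduler.step(...)` inside a denoising loop when per-step log-probabilities
+ # are needed (e.g. for policy-gradient fine-tuning). `scheduler`, `timesteps`, the packed `latents`
+ # and the model call are assumed to come from the surrounding pipeline code; `model_forward` is a
+ # placeholder name.
+ #
+ #     log_probs = []
+ #     for t in timesteps:
+ #         noise_pred = model_forward(latents, t)
+ #         latents, log_prob, _, _ = sde_step_with_logprob(
+ #             scheduler, noise_pred, t.expand(latents.shape[0]), latents
+ #         )
+ #         log_probs.append(log_prob)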
133
+
148
+ def sde_step_with_logprob_simple(
149
+ self: FlowMatchEulerDiscreteScheduler,
150
+ model_output: torch.FloatTensor,
151
+ timestep: Union[float, torch.FloatTensor],
152
+ sample: torch.FloatTensor,
153
+ prev_sample: Optional[torch.FloatTensor] = None,
154
+ generator: Optional[torch.Generator] = None,
155
+ ):
156
+ """
157
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the flow
158
+ process from the learned model outputs (most often the predicted velocity).
159
+
160
+ Args:
161
+ model_output (`torch.FloatTensor`):
162
+ The direct output from learned flow model.
163
+ timestep (`float`):
164
+ The current discrete timestep in the diffusion chain.
165
+ sample (`torch.FloatTensor`):
166
+ A current instance of a sample created by the diffusion process.
167
+ generator (`torch.Generator`, *optional*):
168
+ A random number generator.
169
+ """
170
+
171
+ step_index = [self.index_for_timestep(t) for t in timestep]
172
+ prev_step_index = [step+1 for step in step_index]
173
+ sigma = self.sigmas[step_index].view(-1, 1, 1, 1).to(model_output.device)
174
+ sigma_prev = self.sigmas[prev_step_index].view(-1, 1, 1, 1).to(model_output.device)
175
+ sigma_max = self.sigmas[1].item()
176
+ dt = sigma_prev - sigma
177
+
178
+
179
+ eta = 0.5
180
+ Dt = - dt * eta
181
+
182
+ prev_sample_mean = sample * (1 - Dt / (1 - torch.where(sigma == 1, sigma_max, sigma))) + model_output * (dt - Dt)
183
+
184
+ std_dev_t = torch.sqrt(2 * Dt * (sigma / (1 - torch.where(sigma == 1, sigma_max, sigma))))
185
+
186
+ if prev_sample is not None and generator is not None:
187
+ raise ValueError(
188
+ "Cannot pass both generator and prev_sample. Please make sure that either `generator` or"
189
+ " `prev_sample` stays `None`."
190
+ )
191
+
192
+ if prev_sample is None:
193
+ # Generate noise if not provided
194
+ variance_noise = randn_tensor(
195
+ model_output.shape,
196
+ generator=generator,
197
+ device=model_output.device,
198
+ dtype=model_output.dtype,
199
+ )
200
+
201
+ prev_sample = prev_sample_mean + std_dev_t * variance_noise
202
+
203
+
204
+ log_prob = (
205
+ -((prev_sample.detach() - prev_sample_mean) ** 2) / (2 * (std_dev_t**2))
206
+ - torch.log(std_dev_t)
207
+ - torch.log(torch.sqrt(2 * torch.as_tensor(math.pi)))
208
+ )
209
+
210
+ # mean along all but batch dimension
211
+ log_prob = log_prob.mean(dim=tuple(range(1, log_prob.ndim)))
212
+
213
+ return prev_sample, log_prob, prev_sample_mean, std_dev_t
214
+
215
+ class QwenKontextMetaModel:
216
+
217
+ def __init__(self, config):
218
+ super(QwenKontextMetaModel, self).__init__(config)
219
+
220
+ if hasattr(config, "diffusion_expert"):
221
+ ckpt_id = "black-forest-labs/FLUX.1-Kontext-dev"
222
+ # Load configuration for each component
223
+ transformer_config = FluxTransformer2DModel.load_config(ckpt_id, subfolder="transformer")
224
+ vae_config = AutoencoderKL.load_config(ckpt_id, subfolder="vae")
225
+ text_encoder_config = CLIPTextConfig.from_pretrained(ckpt_id, subfolder="text_encoder")
226
+ text_encoder_2_config = T5Config.from_pretrained(ckpt_id, subfolder="text_encoder_2")
227
+
228
+ # Initialize components from their configurations
229
+ self.transformer = FluxTransformer2DModel.from_config(transformer_config)
230
+ self.vae = AutoencoderKL.from_config(vae_config)
231
+ self.text_encoder = CLIPTextModel(text_encoder_config)
232
+ self.text_encoder_2 = T5EncoderModel(text_encoder_2_config)
233
+
234
+ # Initialize tokenizers (these don't use from_config as they are not models)
235
+ self.tokenizer = CLIPTokenizer.from_pretrained(ckpt_id, subfolder="tokenizer")
236
+ self.tokenizer_2 = T5TokenizerFast.from_pretrained(ckpt_id, subfolder="tokenizer_2")
237
+
238
+ self.scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(ckpt_id, subfolder="scheduler")
239
+
240
+ # Create the pipeline configuration dictionary
241
+ pipeline_config = {
242
+ "transformer": self.transformer,
243
+ "scheduler": self.scheduler,
244
+ "vae": self.vae,
245
+ "text_encoder": self.text_encoder,
246
+ "text_encoder_2": self.text_encoder_2,
247
+ "tokenizer": self.tokenizer,
248
+ "tokenizer_2": self.tokenizer_2,
249
+ }
250
+
251
+ self.diffusion_expert = FluxKontextPipeline(**pipeline_config)
252
+
253
+
254
+ def initialize_diffusion_expert(self, fsdp=None):
255
+
256
+ if getattr(self, 'diffusion_expert', None) is None:
257
+ print("random initiation the diffusion expert !!!")
258
+ self.diffusion_expert = FluxKontextPipeline.from_pretrained("black-forest-labs/FLUX.1-Kontext-dev", revision="main", torch_dtype=torch.bfloat16).to(torch.bfloat16)
259
+ self.text_encoder = self.diffusion_expert.text_encoder
260
+ self.text_encoder_2 = self.diffusion_expert.text_encoder_2
261
+ self.tokenizer = self.diffusion_expert.tokenizer
262
+ self.tokenizer_2 = self.diffusion_expert.tokenizer_2
263
+ self.vae = self.diffusion_expert.vae
264
+ self.transformer = self.diffusion_expert.transformer
265
+ self.scheduler = self.diffusion_expert.scheduler
266
+
267
+ self.config.diffusion_expert = "flux"
268
+
269
+
270
+
271
+ class QwenKontextConfig(Qwen2_5_VLConfig):
272
+ model_type = "QwenKontext"
273
+
274
+
275
+ class QwenKontextModel(QwenKontextMetaModel, Qwen2_5_VLModel):
276
+ config_class = QwenKontextConfig
277
+
278
+ def __init__(self, config: Qwen2_5_VLConfig):
279
+ super(QwenKontextModel, self).__init__(config)
280
+
281
+
282
+ class QwenKontextForInferenceLM(Qwen2_5_VLForConditionalGeneration):
283
+ config_class = QwenKontextConfig
284
+
285
+ def __init__(self, config):
286
+ Qwen2_5_VLForConditionalGeneration.__init__(self, config)
287
+ config.model_type = "QwenKontext"
288
+
289
+ self.model = QwenKontextModel(config)
290
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
291
+ self.post_init()
292
+
293
+ def get_model(self):
294
+ return self.model
295
+
296
+ @torch.no_grad()
297
+ def generate_image(
298
+ self,
299
+ images: List[Image.Image],
300
+ texts: List[str],
301
+ diffusion_kwargs: Optional[Dict] = dict(guidance_scale = 3.5, num_inference_steps=25),
302
+ sde_sampling: Optional[bool] = False,
303
+ ):
304
+
305
+ if isinstance(texts, str):
306
+ texts = [texts]
307
+
308
+ if not sde_sampling:
309
+ output_img = self.model.diffusion_expert(
310
+ images,
311
+ texts,
312
+ max_sequence_length=512,
313
+ **diffusion_kwargs,
314
+ ).images
315
+ return output_img
316
+ else:
317
+ return self.model.diffusion_expert.sde_sampling(
318
+ images,
319
+ texts,
320
+ max_sequence_length=512,
321
+ **diffusion_kwargs,
322
+ )
323
+
324
+
325
+ def extract_thinking_content(self, text: str) -> str:
326
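+ # Keep only the content of the last <answer>...</answer> block; fall back to the stripped raw text when no tags are found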
+ pattern = r'<answer>(.*?)</answer>'
327
+ matches = re.findall(pattern, text, re.DOTALL)
328
+
329
+ if matches:
330
+ return matches[-1].strip().replace("<answer>", "").replace("</answer>", "")
331
+ else:
332
+ return text.strip().replace("<answer>", "").replace("</answer>", "")
333
+
334
+ @torch.no_grad()
335
+ def generate_image_cot(
336
+ self,
337
+ images: List[Image.Image],
338
+ texts: List[str],
339
+ processor: Optional[object] = None,
340
+ diffusion_kwargs: Optional[Dict] = dict(guidance_scale = 2.5, num_inference_steps=25),
341
+ llm_kwargs: Optional[Dict] = dict(max_new_tokens=256, temperature=0.7, top_p=0.9, do_sample=True),
342
+ cot_prompt_template: Optional[str] = None,
343
+ ):
344
+
345
+ if isinstance(texts, str):
346
+ texts = [texts]
347
+
348
+ if cot_prompt_template is None:
349
+ cot_prompt_template = """Please provide an enhanced prompt for the following image editing prompt.
350
+ Ensure the revised prompt is clear, specific, and includes detailed instructions to achieve the desired outcome while maintaining the original intent.
351
+ Original prompt: {original_prompt}. Directly provide the improved prompt in <answer> </answer> tags."""
352
+
353
+
354
+ improved_prompts = []
355
+
356
+ for text, image in zip(texts, images):
357
+ cot_input = cot_prompt_template.format(original_prompt=text)
358
+
359
+ messages = [
360
+ {
361
+ "role": "user",
362
+ "content": [
363
+ {
364
+ "type": "image",
365
+ "image": image,
366
+ },
367
+ {"type": "text", "text": cot_input},
368
+ ],
369
+ }
370
+ ]
371
+
372
+ input_text_formatted = processor.apply_chat_template(
373
+ messages, tokenize=False, add_generation_prompt=True
374
+ )
375
+ image_inputs, video_inputs = process_vision_info(messages)
376
+ model_inputs = processor(
377
+ images=image_inputs,
378
+ text=[input_text_formatted],
379
+ return_tensors="pt"
380
+ ).to(self.device)
381
+
382
+ generated_ids = self.generate(
383
+ **model_inputs,
384
+ **llm_kwargs,
385
+ eos_token_id=processor.tokenizer.eos_token_id,
386
+ pad_token_id=processor.tokenizer.pad_token_id
387
+ )
388
+
389
+ generated_text = processor.batch_decode(
390
+ generated_ids[:, model_inputs['input_ids'].shape[1]:],
391
+ skip_special_tokens=True
392
+ )
393
+
394
+ improved_prompt = [self.extract_thinking_content(decode_text) for decode_text in generated_text]
395
+ improved_prompts.extend(improved_prompt)
396
+
397
+ print(f"Original prompt: {text}")
398
+ print(f"Improved prompt: {improved_prompt}")
399
+ print("-" * 50)
400
+
401
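+ # Edit all reference images in a single batch using the refined prompts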
+ output_images = self.generate_image(images, improved_prompts, diffusion_kwargs)
402
+
403
+ return {
404
+ 'ref_images': images,
405
+ 'images': output_images,
406
+ 'original_prompts': texts,
407
+ 'improved_prompts': improved_prompts
408
+ }
409
+
410
+ AutoConfig.register("QwenKontext", QwenKontextConfig)
411
+ AutoModelForCausalLM.register(QwenKontextConfig, QwenKontextForInferenceLM)
412
+
413
+
414
+ if __name__ == "__main__":
415
+ model = QwenKontextForInferenceLM.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct",torch_dtype=torch.bfloat16)
416
+ model.model.initialize_diffusion_expert()
417
+ model.model.diffusion_expert.to("cuda:0")
418
+ model.to("cuda:0")
419
+ processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
420
+ text = ["add a hat to him"]
421
+ ref_image = [Image.open("assets/images/cat.jpg").convert("RGB")]
422
+ images = model.generate_image(ref_image, text)
423
+ images[0].save("test_flux.jpg")
424
+ model.save_pretrained("outputs/pretrain/qwenkontext")
425
+
426
+
427
+ # model = QwenKontextForInferenceLM.from_pretrained("outputs/pretrain/qwenkontext", torch_dtype=torch.bfloat16)
428
+ # model.to("cuda:0")
429
+ # transform = transforms.Compose([
430
+ # transforms.Resize(512, interpolation=transforms.InterpolationMode.BILINEAR), # Shortest side to 512
431
+ # transforms.CenterCrop((512, 512)) # Center crop to 512x512
432
+ # ])
433
+ # processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
434
+ # text = ["add a hat to him"]
435
+ # ref_image = [transform(Image.open("assets/images/cat.jpg").convert("RGB"))]
436
+ # ref_image[0].save("ref.jpg")
437
+ # images = model.generate_image(ref_image, text)
438
+ # images[0].save("test_flux.jpg")
439
+
440
+ # outputs = model.generate_image_cot(ref_image, text, processor = processor)
441
+ # outputs['images'][0].save("test_flux_cot.jpg")
442
+
unimodel/qwensana/qwensana_inference.py ADDED
@@ -0,0 +1,310 @@
1
+ # Copyright 2025 Fu-Yun Wang
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from typing import List, Optional, Tuple, Union, Dict
17
+ import torch
18
+ import torch.nn as nn
19
+ from PIL import Image
20
+ import torch.nn.functional as F
21
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoProcessor
22
+ from transformers import Qwen2_5_VLConfig, Qwen2_5_VLModel, Qwen2_5_VLForConditionalGeneration, T5Config, Gemma2Model, GemmaTokenizer, GemmaTokenizerFast, Gemma2Config, AutoConfig
23
+ from diffusers import SanaPipeline, AutoencoderDC, FlowMatchEulerDiscreteScheduler, SanaTransformer2DModel, DPMSolverMultistepScheduler
24
+ import re
25
+ from datetime import datetime
26
+ import os
27
+
28
+
29
+ def save_grid_image(prompt, images, rows, cols):
30
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
31
+ base_dir = os.path.join("samples", timestamp, prompt[:100])
32
+ os.makedirs(base_dir, exist_ok=True)
33
+
34
+ filename = os.path.join(base_dir, "grid.jpg")
35
+ grid_image = create_image_grid(images, rows, cols)
36
+ grid_image.save(filename)
37
+
38
+ print(f"Saved: {filename}")
39
+
40
+ def create_image_grid(images, rows, cols):
41
+ """Creates a grid of images and returns a single PIL Image."""
42
+ assert len(images) == rows * cols
43
+
44
+ width, height = images[0].size
45
+ grid_width = width * cols
46
+ grid_height = height * rows
47
+
48
+ grid_image = Image.new('RGB', (grid_width, grid_height))
49
+
50
+ for i, image in enumerate(images):
51
+ x = (i % cols) * width
52
+ y = (i // cols) * height
53
+ grid_image.paste(image, (x, y))
54
+
55
+ return grid_image
56
+
57
+
58
+ class QwenSanaMetaModel:
59
+
60
+ def __init__(self, config):
61
+ super(QwenSanaMetaModel, self).__init__(config)
62
+ if hasattr(config, "diffusion_expert"):
63
+ ckpt_id = "Efficient-Large-Model/SANA1.5_1.6B_1024px_diffusers"
64
+
65
+ # Load configuration for each component
66
+ transformer_config = SanaTransformer2DModel.load_config(ckpt_id, subfolder="transformer")
67
+ vae_config = AutoencoderDC.load_config(ckpt_id, subfolder="vae")
68
+ text_encoder_config = Gemma2Config.from_pretrained(ckpt_id, subfolder="text_encoder")
69
+ scheduler_config = DPMSolverMultistepScheduler.load_config(ckpt_id, subfolder="scheduler")
70
+ # Initialize components from their configurations
71
+ self.transformer = SanaTransformer2DModel.from_config(transformer_config)
72
+ self.vae = AutoencoderDC.from_config(vae_config)
73
+ self.text_encoder = Gemma2Model(text_encoder_config)
74
+ self.scheduler = DPMSolverMultistepScheduler.from_config(scheduler_config)
75
+
76
+ # Initialize tokenizer
77
+ self.tokenizer = GemmaTokenizerFast.from_pretrained(ckpt_id, subfolder="tokenizer")
78
+
79
+ # Create the pipeline configuration dictionary
80
+ pipeline_config = {
81
+ "transformer": self.transformer,
82
+ "scheduler": self.scheduler,
83
+ "vae": self.vae,
84
+ "text_encoder": self.text_encoder,
85
+ "tokenizer": self.tokenizer,
86
+ }
87
+
88
+ self.diffusion_expert = SanaPipeline(**pipeline_config)
89
+
90
+ def initialize_diffusion_expert(self, fsdp=None):
91
+
92
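+ # Load the pretrained SANA pipeline only when the expert was not already built from configs in __init__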
+ if getattr(self, 'diffusion_expert', None) is None:
93
+ print("Random initiation the Sana diffusion expert !!!")
94
+ self.diffusion_expert = SanaPipeline.from_pretrained(
95
+ "Efficient-Large-Model/SANA1.5_1.6B_1024px_diffusers",
96
+ torch_dtype=torch.bfloat16
97
+ )
98
+
99
+ # Store references to components for easier access
100
+ self.transformer = self.diffusion_expert.transformer
101
+ self.vae = self.diffusion_expert.vae
102
+ self.text_encoder = self.diffusion_expert.text_encoder
103
+ self.tokenizer = self.diffusion_expert.tokenizer
104
+ self.scheduler = self.diffusion_expert.scheduler
105
+
106
+ self.config.diffusion_expert = "Sana"
107
+
108
+
109
+ class QwenSanaConfig(Qwen2_5_VLConfig):
110
+ model_type = "QwenSana"
111
+
112
+
113
+ class QwenSanaModel(QwenSanaMetaModel, Qwen2_5_VLModel):
114
+ config_class = QwenSanaConfig
115
+
116
+ def __init__(self, config: Qwen2_5_VLConfig):
117
+ super(QwenSanaModel, self).__init__(config)
118
+
119
+
120
+ class QwenSanaForInferenceLM(Qwen2_5_VLForConditionalGeneration):
121
+ config_class = QwenSanaConfig
122
+
123
+ def __init__(self, config):
124
+ Qwen2_5_VLForConditionalGeneration.__init__(self, config)
125
+ config.model_type = "QwenSana"
126
+
127
+ self.model = QwenSanaModel(config)
128
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
129
+ self.post_init()
130
+
131
+ def get_model(self):
132
+ return self.model
133
+
134
+ @torch.no_grad()
135
+ def generate_image(
136
+ self,
137
+ texts: List[str],
138
+ diffusion_kwargs: Optional[Dict] = None,
139
+ ):
140
+
141
+ if isinstance(texts, str):
142
+ texts = [texts]
143
+
144
+ # Default parameters for Sana
145
+ default_kwargs = dict(
146
+ guidance_scale=3.5,
147
+ num_inference_steps=20,
148
+ height=1024,
149
+ width=1024
150
+ )
151
+
152
+ if diffusion_kwargs:
153
+ default_kwargs.update(diffusion_kwargs)
154
+
155
+ output_img = self.model.diffusion_expert(
156
+ texts,
157
+ **default_kwargs,
158
+ ).images
159
+
160
+ return output_img
161
+
162
+ def extract_thinking_content(self, text: str) -> str:
163
+ pattern = r'<answer>(.*?)</answer>'
164
+ matches = re.findall(pattern, text, re.DOTALL)
165
+
166
+ if matches:
167
+ return matches[-1].strip().replace("<answer>", "").replace("</answer>", "")
168
+ else:
169
+ return text.strip().replace("<answer>", "").replace("</answer>", "")
170
+
171
+ @torch.no_grad()
172
+ def generate_image_cot(
173
+ self,
174
+ texts: List[str],
175
+ processor: Optional[object] = None,
176
+ diffusion_kwargs: Optional[Dict] = None,
177
+ llm_kwargs: Optional[Dict] = None,
178
+ cot_prompt_template: Optional[str] = None,
179
+ ):
180
+
181
+ if isinstance(texts, str):
182
+ texts = [texts]
183
+
184
+ # Default parameters
185
+ default_diffusion_kwargs = dict(
186
+ guidance_scale=5.0,
187
+ num_inference_steps=20,
188
+ height=1024,
189
+ width=1024
190
+ )
191
+ if diffusion_kwargs:
192
+ default_diffusion_kwargs.update(diffusion_kwargs)
193
+
194
+ default_llm_kwargs = dict(
195
+ max_new_tokens=256,
196
+ temperature=0.7,
197
+ top_p=0.9,
198
+ do_sample=True
199
+ )
200
+ if llm_kwargs:
201
+ default_llm_kwargs.update(llm_kwargs)
202
+
203
+ if cot_prompt_template is None:
204
+ cot_prompt_template = """Please provide an enhanced prompt for the following image generation prompt to make the image more realistic, detailed, with clear separation and precise alignment of all entities.
205
+ Original prompt: {original_prompt}. Directly provide the improved prompt in <answer> </answer> tags."""
206
+
207
+ improved_prompts = []
208
+
209
+ for text in texts:
210
+ cot_input = cot_prompt_template.format(original_prompt=text)
211
+
212
+ messages = [{"role": "user", "content": cot_input}]
213
+ input_text_formatted = processor.apply_chat_template(
214
+ messages, tokenize=False, add_generation_prompt=True
215
+ )
216
+ model_inputs = processor(
217
+ text=[input_text_formatted],
218
+ return_tensors="pt"
219
+ ).to(self.device)
220
+
221
+ generated_ids = self.generate(
222
+ **model_inputs,
223
+ **default_llm_kwargs,
224
+ eos_token_id=processor.tokenizer.eos_token_id,
225
+ pad_token_id=processor.tokenizer.pad_token_id
226
+ )
227
+
228
+ generated_text = processor.batch_decode(
229
+ generated_ids[:, model_inputs['input_ids'].shape[1]:],
230
+ skip_special_tokens=True
231
+ )
232
+
233
+ improved_prompt = [self.extract_thinking_content(decode_text) for decode_text in generated_text]
234
+ improved_prompts.extend(improved_prompt)
235
+
236
+ print(f"Original prompt: {text}")
237
+ print(f"Improved prompt: {improved_prompt}")
238
+ print("-" * 50)
239
+
240
+ output_images = self.generate_image(improved_prompts, default_diffusion_kwargs)
241
+
242
+ return {
243
+ 'images': output_images,
244
+ 'original_prompts': texts,
245
+ 'improved_prompts': improved_prompts
246
+ }
247
+
248
+
249
+ AutoConfig.register("QwenSana", QwenSanaConfig)
250
+ AutoModelForCausalLM.register(QwenSanaConfig, QwenSanaForInferenceLM)
251
+
252
+
253
+ if __name__ == "__main__":
254
+ model = QwenSanaForInferenceLM.from_pretrained(
255
+ "Qwen/Qwen2.5-VL-3B-Instruct",
256
+ torch_dtype=torch.bfloat16
257
+ )
258
+ model.model.initialize_diffusion_expert()
259
+ model.model.diffusion_expert.to("cuda:0")
260
+ model.to("cuda:0")
261
+
262
+ processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
263
+
264
+ # Test basic image generation
265
+ text = ["a photo of a cat"]
266
+ diffusion_kwargs = dict(
267
+ guidance_scale=3.5,
268
+ num_inference_steps=20,
269
+ width=1024,
270
+ height=1024,
271
+ generator=torch.manual_seed(0)
272
+ )
273
+
274
+ images = model.generate_image(text, diffusion_kwargs=diffusion_kwargs)
275
+ images[0].save("test_Sana.jpg")
276
+
277
+ # Test chain-of-thought image generation
278
+ outputs = model.generate_image_cot(text, processor=processor, diffusion_kwargs=diffusion_kwargs)
279
+ outputs['images'][0].save("test_Sana_cot.jpg")
280
+
281
+ # Save the model
282
+ model.save_pretrained("outputs/pretrain/qwenSana-1.5")
283
+
284
+ # print("Sana model integration completed successfully!")
285
+
286
+ # model = QwenSanaForInferenceLM.from_pretrained(
287
+ # "outputs/pretrain/qwenSana-1.5",
288
+ # torch_dtype=torch.bfloat16
289
+ # ).to("cuda")
290
+
291
+
292
+ # processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
293
+
294
+ # # Test basic image generation
295
+ # text = ["a photo of a cat"]
296
+ # diffusion_kwargs = dict(
297
+ # guidance_scale=5.0,
298
+ # num_inference_steps=20,
299
+ # width=1024,
300
+ # height=1024,
301
+ # generator=torch.manual_seed(0)
302
+ # )
303
+
304
+ # images = model.generate_image(text, diffusion_kwargs=diffusion_kwargs)
305
+ # images[0].save("test_Sana.jpg")
306
+
307
+ # # Test chain-of-thought image generation
308
+ # outputs = model.generate_image_cot(text, processor=processor, diffusion_kwargs=diffusion_kwargs)
309
+ # outputs['images'][0].save("test_Sana_cot.jpg")
310
+
unimodel/qwensd3/qwensd3_inference.py ADDED
@@ -0,0 +1,447 @@
1
+ # Copyright 2025 Fu-Yun Wang
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import List, Optional, Tuple, Union, Dict
16
+ import torch
17
+ import torch.nn as nn
18
+ from PIL import Image
19
+ import torch.nn.functional as F
20
+ from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoProcessor
21
+ from transformers import Qwen2_5_VLConfig, Qwen2_5_VLModel, Qwen2_5_VLForConditionalGeneration
22
+
23
+
24
+ from diffusers.utils.torch_utils import randn_tensor
25
+ from diffusers.pipelines.pipeline_utils import numpy_to_pil
26
+ import numpy as np
27
+ from diffusers.schedulers.scheduling_flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler, FlowMatchEulerDiscreteSchedulerOutput
28
+ from diffusers.schedulers import DPMSolverMultistepScheduler
29
+ import math
30
+ from diffusers.utils.torch_utils import randn_tensor
31
+ from diffusers import SD3Transformer2DModel, AutoencoderKL, FlowMatchEulerDiscreteScheduler
32
+ from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast, CLIPTextConfig, T5Config, CLIPTextModelWithProjection
33
+ try:
34
+ from .sd3pipeline import StableDiffusion3Pipeline as SD3Pipeline
35
+ except:
36
+ from sd3pipeline import StableDiffusion3Pipeline as SD3Pipeline
37
+ # from diffusers import StableDiffusion3Pipeline as SD3Pipeline
38
+ import re
39
+ from datetime import datetime
40
+ import os
41
+ from transformers import GenerationConfig
42
+
43
+
44
+ def save_grid_image(prompt, images, rows, cols):
45
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
46
+ base_dir = os.path.join("samples", timestamp, prompt[:100])
47
+ os.makedirs(base_dir, exist_ok=True)
48
+
49
+ filename = os.path.join(base_dir, "grid.jpg")
50
+ grid_image = create_image_grid(images, rows, cols)
51
+ grid_image.save(filename)
52
+
53
+ print(f"Saved: {filename}")
54
+
55
+ def create_image_grid(images, rows, cols):
56
+ """Creates a grid of images and returns a single PIL Image."""
57
+
58
+ assert len(images) == rows * cols
59
+
60
+ width, height = images[0].size
61
+ grid_width = width * cols
62
+ grid_height = height * rows
63
+
64
+ grid_image = Image.new('RGB', (grid_width, grid_height))
65
+
66
+ for i, image in enumerate(images):
67
+ x = (i % cols) * width
68
+ y = (i // cols) * height
69
+ grid_image.paste(image, (x, y))
70
+
71
+ return grid_image
72
+
73
+
74
+ def sde_step_with_logprob(
75
+ self: FlowMatchEulerDiscreteScheduler,
76
+ model_output: torch.FloatTensor,
77
+ timestep: Union[float, torch.FloatTensor],
78
+ sample: torch.FloatTensor,
79
+ prev_sample: Optional[torch.FloatTensor] = None,
80
+ generator: Optional[torch.Generator] = None,
81
+ ) -> Union[FlowMatchEulerDiscreteSchedulerOutput, Tuple]:
82
+ """
83
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the flow
84
+ process from the learned model outputs (most often the predicted velocity).
85
+
86
+ Args:
87
+ model_output (`torch.FloatTensor`):
88
+ The direct output from learned flow model.
89
+ timestep (`float`):
90
+ The current discrete timestep in the diffusion chain.
91
+ sample (`torch.FloatTensor`):
92
+ A current instance of a sample created by the diffusion process.
93
+ generator (`torch.Generator`, *optional*):
94
+ A random number generator.
95
+ """
96
+ step_index = [self.index_for_timestep(t) for t in timestep]
97
+ prev_step_index = [step+1 for step in step_index]
98
+ sigma = self.sigmas[step_index].view(-1, 1, 1, 1).to(model_output.device)
99
+ sigma_prev = self.sigmas[prev_step_index].view(-1, 1, 1, 1).to(model_output.device)
100
+ sigma_max = self.sigmas[1].item()
101
+ dt = sigma_prev - sigma
102
+
103
+ std_dev_t = torch.sqrt(sigma / (1 - torch.where(sigma == 1, sigma_max, sigma)))*0.7
104
+
105
+
106
+ # our sde
107
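+ # Mean of the reverse-SDE transition: the deterministic flow update (sample + velocity * dt) plus correction terms scaled by std_dev_t**2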
+ prev_sample_mean = sample*(1+std_dev_t**2/(2*sigma)*dt)+model_output*(1+std_dev_t**2*(1-sigma)/(2*sigma))*dt
108
+
109
+ if prev_sample is not None and generator is not None:
110
+ raise ValueError(
111
+ "Cannot pass both generator and prev_sample. Please make sure that either `generator` or"
112
+ " `prev_sample` stays `None`."
113
+ )
114
+
115
+ if prev_sample is None:
116
+ variance_noise = randn_tensor(
117
+ model_output.shape,
118
+ generator=generator,
119
+ device=model_output.device,
120
+ dtype=model_output.dtype,
121
+ )
122
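+ # Diffusion term: noise scaled by std_dev_t * sqrt(-dt); dt = sigma_prev - sigma is negative while denoising, so -dt > 0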
+ prev_sample = prev_sample_mean + std_dev_t * torch.sqrt(-1*dt) * variance_noise
123
+
124
+
125
+ log_prob = (
126
+ -((prev_sample.detach() - prev_sample_mean) ** 2) / (2 * ((std_dev_t * torch.sqrt(-1*dt))**2))
127
+ - torch.log(std_dev_t * torch.sqrt(-1*dt))
128
+ - torch.log(torch.sqrt(2 * torch.as_tensor(math.pi)))
129
+ )
130
+
131
+ # mean along all but batch dimension
132
+ log_prob = log_prob.mean(dim=tuple(range(1, log_prob.ndim)))
133
+
134
+ return prev_sample, log_prob, prev_sample_mean, std_dev_t * torch.sqrt(-1*dt)
135
+
136
+
137
+
138
+ # Copyright 2025 Fu-Yun Wang
139
+ #
140
+ # Licensed under the Apache License, Version 2.0 (the "License");
141
+ # you may not use this file except in compliance with the License.
142
+ # You may obtain a copy of the License at
143
+ #
144
+ # http://www.apache.org/licenses/LICENSE-2.0
145
+ #
146
+ # Unless required by applicable law or agreed to in writing, software
147
+ # distributed under the License is distributed on an "AS IS" BASIS,
148
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
149
+ # See the License for the specific language governing permissions and
150
+ # limitations under the License.
151
+
152
+ def sde_step_with_logprob_simple(
153
+ self: FlowMatchEulerDiscreteScheduler,
154
+ model_output: torch.FloatTensor,
155
+ timestep: Union[float, torch.FloatTensor],
156
+ sample: torch.FloatTensor,
157
+ prev_sample: Optional[torch.FloatTensor] = None,
158
+ generator: Optional[torch.Generator] = None,
159
+ ):
160
+ """
161
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the flow
162
+ process from the learned model outputs (most often the predicted velocity).
163
+
164
+ Args:
165
+ model_output (`torch.FloatTensor`):
166
+ The direct output from learned flow model.
167
+ timestep (`float`):
168
+ The current discrete timestep in the diffusion chain.
169
+ sample (`torch.FloatTensor`):
170
+ A current instance of a sample created by the diffusion process.
171
+ generator (`torch.Generator`, *optional*):
172
+ A random number generator.
173
+ """
174
+
175
+ step_index = [self.index_for_timestep(t) for t in timestep]
176
+ prev_step_index = [step+1 for step in step_index]
177
+ sigma = self.sigmas[step_index].view(-1, 1, 1, 1).to(model_output.device)
178
+ sigma_prev = self.sigmas[prev_step_index].view(-1, 1, 1, 1).to(model_output.device)
179
+ sigma_max = self.sigmas[1].item()
180
+ dt = sigma_prev - sigma
181
+
182
+
183
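+ # eta controls the stochasticity injected per step; with eta = 0 the mean update reduces to the deterministic Euler flow step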
+ eta = 0.5
184
+ Dt = - dt * eta
185
+
186
+ prev_sample_mean = sample * (1 - Dt / (1 - torch.where(sigma == 1, sigma_max, sigma))) + model_output * (dt - Dt)
187
+
188
+ std_dev_t = torch.sqrt(2 * Dt * (sigma / (1 - torch.where(sigma == 1, sigma_max, sigma))))
189
+
190
+ if prev_sample is not None and generator is not None:
191
+ raise ValueError(
192
+ "Cannot pass both generator and prev_sample. Please make sure that either `generator` or"
193
+ " `prev_sample` stays `None`."
194
+ )
195
+
196
+ if prev_sample is None:
197
+ # Generate noise if not provided
198
+ variance_noise = randn_tensor(
199
+ model_output.shape,
200
+ generator=generator,
201
+ device=model_output.device,
202
+ dtype=model_output.dtype,
203
+ )
204
+
205
+ prev_sample = prev_sample_mean + std_dev_t * variance_noise
206
+
207
+
208
+ log_prob = (
209
+ -((prev_sample.detach() - prev_sample_mean) ** 2) / (2 * (std_dev_t**2))
210
+ - torch.log(std_dev_t)
211
+ - torch.log(torch.sqrt(2 * torch.as_tensor(math.pi)))
212
+ )
213
+
214
+ # mean along all but batch dimension
215
+ log_prob = log_prob.mean(dim=tuple(range(1, log_prob.ndim)))
216
+
217
+ return prev_sample, log_prob, prev_sample_mean, std_dev_t
218
+
219
+ class QwenSD3MetaModel:
220
+
221
+ def __init__(self, config):
222
+ super(QwenSD3MetaModel, self).__init__(config)
223
+ if hasattr(config, "diffusion_expert"):
224
+ ckpt_id = "stabilityai/stable-diffusion-3.5-medium"
225
+
226
+ transformer_config = SD3Transformer2DModel.load_config(ckpt_id, subfolder="transformer")
227
+ vae_config = AutoencoderKL.load_config(ckpt_id, subfolder="vae")
228
+ text_encoder_config = CLIPTextConfig.from_pretrained(ckpt_id, subfolder="text_encoder", torch_dtype=config.torch_dtype)
229
+ text_encoder_2_config = CLIPTextConfig.from_pretrained(ckpt_id, subfolder="text_encoder_2", torch_dtype=config.torch_dtype)
230
+ text_encoder_3_config = T5Config.from_pretrained(ckpt_id, subfolder="text_encoder_3", torch_dtype=config.torch_dtype)
231
+
232
+ # Initialize components from their configurations
233
+ self.transformer = SD3Transformer2DModel.from_config(transformer_config)
234
+ self.vae = AutoencoderKL.from_config(vae_config)
235
+ self.text_encoder = CLIPTextModelWithProjection(text_encoder_config)
236
+ self.text_encoder_2 = CLIPTextModelWithProjection(text_encoder_2_config)
237
+ self.text_encoder_3 = T5EncoderModel(text_encoder_3_config)
238
+
239
+ # Initialize tokenizers (these don't use from_config as they are not models)
240
+ self.tokenizer = CLIPTokenizer.from_pretrained(ckpt_id, subfolder="tokenizer")
241
+ self.tokenizer_2 = CLIPTokenizer.from_pretrained(ckpt_id, subfolder="tokenizer_2")
242
+ self.tokenizer_3 = T5TokenizerFast.from_pretrained(ckpt_id, subfolder="tokenizer_3")
243
+ self.scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(ckpt_id, subfolder="scheduler")
244
+
245
+ # Create the pipeline configuration dictionary
246
+ pipeline_config = {
247
+ "transformer": self.transformer,
248
+ "scheduler": self.scheduler,
249
+ "vae": self.vae,
250
+ "text_encoder": self.text_encoder,
251
+ "text_encoder_2": self.text_encoder_2,
252
+ "text_encoder_3": self.text_encoder_3,
253
+ "tokenizer": self.tokenizer,
254
+ "tokenizer_2": self.tokenizer_2,
255
+ "tokenizer_3": self.tokenizer_3,
256
+ }
257
+
258
+ self.diffusion_expert = SD3Pipeline(**pipeline_config)
259
+
260
+
261
+ def initialize_diffusion_expert(self, fsdp=None):
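+ # Note: unlike the Kontext/Sana variants, this always reloads the pretrained SD3.5 pipeline, even if a diffusion expert already exists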
262
+
263
+ print("random initiation the diffusion expert !!!")
264
+ self.diffusion_expert = SD3Pipeline.from_pretrained("stabilityai/stable-diffusion-3.5-medium", revision="main", torch_dtype=torch.bfloat16)
265
+ self.text_encoder = self.diffusion_expert.text_encoder
266
+ self.text_encoder_model = self.diffusion_expert.text_encoder.text_model
267
+ self.text_encoder_2 = self.diffusion_expert.text_encoder_2
268
+ self.text_encoder_2_model = self.diffusion_expert.text_encoder_2.text_model
269
+ self.text_encoder_3 = self.diffusion_expert.text_encoder_3
270
+ self.tokenizer = self.diffusion_expert.tokenizer
271
+ self.tokenizer_2 = self.diffusion_expert.tokenizer_2
272
+ self.tokenizer_3 = self.diffusion_expert.tokenizer_3
273
+ self.vae = self.diffusion_expert.vae
274
+ self.transformer = self.diffusion_expert.transformer
275
+ self.scheduler = self.diffusion_expert.scheduler
276
+
277
+ self.config.diffusion_expert = "SD3"
278
+
279
+
280
+
281
+ class QwenSD3Config(Qwen2_5_VLConfig):
282
+ model_type = "QwenSD3"
283
+
284
+
285
+ class QwenSD3Model(QwenSD3MetaModel, Qwen2_5_VLModel):
286
+ config_class = QwenSD3Config
287
+
288
+ def __init__(self, config: Qwen2_5_VLConfig):
289
+ super(QwenSD3Model, self).__init__(config)
290
+
291
+
292
+ class QwenSD3ForInferenceLM(Qwen2_5_VLForConditionalGeneration):
293
+ config_class = QwenSD3Config
294
+
295
+ def __init__(self, config):
296
+ Qwen2_5_VLForConditionalGeneration.__init__(self, config)
297
+ config.model_type = "QwenSD3"
298
+
299
+ self.model = QwenSD3Model(config)
300
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
301
+ self.post_init()
302
+
303
+ def get_model(self):
304
+ return self.model
305
+
306
+
307
+
308
+ @torch.no_grad()
309
+ def generate_image(
310
+ self,
311
+ texts: List[str],
312
+ diffusion_kwargs: Optional[Dict] = dict(guidance_scale = 3.5, num_inference_steps=25),
313
+ sde_sampling: Optional[bool] = False,
314
+ ):
315
+
316
+ if isinstance(texts, str):
317
+ texts = [texts]
318
+
319
+ if not sde_sampling:
320
+ output_img = self.model.diffusion_expert(
321
+ texts,
322
+ max_sequence_length=512,
323
+ **diffusion_kwargs,
324
+ ).images
325
+ return output_img
326
+ else:
327
+ return self.model.diffusion_expert.sde_sampling(
328
+ texts,
329
+ max_sequence_length=512,
330
+ **diffusion_kwargs,
331
+ )
332
+
333
+
334
+ def extract_thinking_content(self, text: str) -> str:
335
+ pattern = r'<answer>(.*?)</answer>'
336
+ matches = re.findall(pattern, text, re.DOTALL)
337
+
338
+ if matches:
339
+ return matches[-1].strip().replace("<answer>", "").replace("</answer>", "")
340
+ else:
341
+ return text.strip().replace("<answer>", "").replace("</answer>", "")
342
+
343
+ @torch.no_grad()
344
+ def generate_image_cot(
345
+ self,
346
+ texts: List[str],
347
+ processor: Optional[object] = None,
348
+ diffusion_kwargs: Optional[Dict] = dict(guidance_scale = 3.5, num_inference_steps=25),
349
+ llm_kwargs: Optional[Dict] = dict(max_new_tokens=256, temperature=0.7, top_p=0.9, do_sample=True),
350
+ cot_prompt_template: Optional[str] = None,
351
+ ):
352
+
353
+ if isinstance(texts, str):
354
+ texts = [texts]
355
+
356
+ if cot_prompt_template is None:
357
+ # cot_prompt_template = """Please improve the following image generation prompt to make it more detailed and specific for better image quality. Think step by step about what visual elements would make this image more compelling. Original prompt: {original_prompt}. Please provide the improved prompt in <thinking> </thinking> tags."""
358
+ cot_prompt_template = """Please provide an enhanced prompt for the following image generation prompt to make the image more realistic, detailed, with clear separation and precise alignment of all entities.
359
+ Original prompt: {original_prompt}. Directly provide the improved prompt in <answer> </answer> tags."""
360
+
361
+ improved_prompts = []
362
+
363
+ for text in texts:
364
+ cot_input = cot_prompt_template.format(original_prompt=text)
365
+
366
+ messages = [{"role": "user", "content": cot_input}]
367
+ input_text_formatted = processor.apply_chat_template(
368
+ messages, tokenize=False, add_generation_prompt=True
369
+ )
370
+ model_inputs = processor(
371
+ text=[input_text_formatted],
372
+ return_tensors="pt"
373
+ ).to(self.device)
374
+
375
+ generated_ids = self.generate(
376
+ **model_inputs,
377
+ **llm_kwargs,
378
+ eos_token_id=processor.tokenizer.eos_token_id,
379
+ pad_token_id=processor.tokenizer.pad_token_id
380
+ )
381
+
382
+ generated_text = processor.batch_decode(
383
+ generated_ids[:, model_inputs['input_ids'].shape[1]:],
384
+ skip_special_tokens=True
385
+ )
386
+
387
+ improved_prompt = [self.extract_thinking_content(decode_text) for decode_text in generated_text]
388
+ improved_prompts.extend(improved_prompt)
389
+
390
+ print(f"Original prompt: {text}")
391
+ print(f"Improved prompt: {improved_prompt}")
392
+ print("-" * 50)
393
+
394
+ output_images = self.generate_image(improved_prompts, diffusion_kwargs)
395
+
396
+ return {
397
+ 'images': output_images,
398
+ 'original_prompts': texts,
399
+ 'improved_prompts': improved_prompts
400
+ }
401
+
402
+ AutoConfig.register("QwenSD3", QwenSD3Config)
403
+ AutoModelForCausalLM.register(QwenSD3Config, QwenSD3ForInferenceLM)
404
+
405
+
406
+ if __name__ == "__main__":
407
+ pass
408
+
409
+
410
+ model = QwenSD3ForInferenceLM.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct",torch_dtype=torch.bfloat16)
411
+ model.model.initialize_diffusion_expert()
412
+ model.model.diffusion_expert.to("cuda:0")
413
+ model.to("cuda:0")
414
+ processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
415
+ text = ["a photo of a cat"]
416
+ images = model.generate_image(text)
417
+ images[0].save("test_SD3.jpg")
418
+ outputs = model.generate_image_cot(text, processor = processor)
419
+ outputs['images'][0].save("test_SD3_cot.jpg")
420
+
421
+ model.save_pretrained("qwensd3")
422
+
423
+ # model = QwenSD3ForInferenceLM.from_pretrained("qwenSD3.0", torch_dtype=torch.bfloat16)
424
+ # model.to("cuda:0")
425
+ # model.save_pretrained("qwenSD3-test-2", torch_dtype=torch.bfloat16)
426
+
427
+ # model = QwenSD3ForInferenceLM.from_pretrained("qwenSD3-test", torch_dtype=torch.float16)
428
+ # # model.to("cuda:0")
429
+ # for n, p in model.named_parameters():
430
+ # if not p.dtype == torch.float16:
431
+ # print(n)
432
+ # processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
433
+ # text = ["a photo of a cat"]
434
+ # diffusion_kwargs = dict(guidance_scale = 5., num_inference_steps=20, width = 512, height = 512, generator = torch.manual_seed(0))
435
+ # images = model.generate_image(text, diffusion_kwargs=diffusion_kwargs)
436
+ # images[0].save("test_SD3.jpg")
437
+
438
+ # llm_kwargs = dict(max_new_tokens=256, temperature=0.7, top_p=0.9, do_sample=True, num_return_sequences=8)
439
+ # # generation_config = GenerationConfig.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct", trust_remote_code=True)
440
+ # # generation_config.num_return_sequences = 8
441
+ # # print(generation_config)
442
+ # # llm_kwargs = dict(max_new_tokens=256, temperature=0.7, top_p=0.9, do_sample=True, generation_config=generation_config)
443
+
444
+ # outputs = model.generate_image_cot(text, processor = processor, llm_kwargs = llm_kwargs)
445
+ # # save_grid_image("cat", images['images'], 2, 2)
446
+ # for idx, image in enumerate(outputs['images']):
447
+ # image.save(f"test_SD3_cot_{idx}.jpg")
unimodel/qwensd3/sd3pipeline.py ADDED
@@ -0,0 +1,1162 @@
1
+ # Copyright 2024 Stability AI, The HuggingFace Team and The InstantX Team. All rights reserved.
2
+ # Copyright 2025 Fu-Yun Wang
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import inspect
17
+ from typing import Any, Callable, Dict, List, Optional, Union
18
+
19
+ import torch
20
+ from transformers import (
21
+ CLIPTextModelWithProjection,
22
+ CLIPTokenizer,
23
+ SiglipImageProcessor,
24
+ SiglipVisionModel,
25
+ T5EncoderModel,
26
+ T5TokenizerFast,
27
+ )
28
+
29
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
30
+ from diffusers.loaders import FromSingleFileMixin, SD3IPAdapterMixin, SD3LoraLoaderMixin
31
+ from diffusers.models.autoencoders import AutoencoderKL
32
+ from diffusers.models.transformers import SD3Transformer2DModel
33
+ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
34
+ from diffusers.utils import (
35
+ USE_PEFT_BACKEND,
36
+ is_torch_xla_available,
37
+ logging,
38
+ replace_example_docstring,
39
+ scale_lora_layers,
40
+ unscale_lora_layers,
41
+ )
42
+ from diffusers.utils.torch_utils import randn_tensor
43
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
44
+ from diffusers.pipelines.stable_diffusion_3.pipeline_output import StableDiffusion3PipelineOutput
45
+ import deepspeed
46
+ from PIL import Image
47
+ import numpy as np
48
+ if is_torch_xla_available():
49
+ import torch_xla.core.xla_model as xm
50
+
51
+ XLA_AVAILABLE = True
52
+ else:
53
+ XLA_AVAILABLE = False
54
+
55
+
56
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
57
+
58
+ EXAMPLE_DOC_STRING = """
59
+ Examples:
60
+ ```py
61
+ >>> import torch
62
+ >>> from diffusers import StableDiffusion3Pipeline
63
+
64
+ >>> pipe = StableDiffusion3Pipeline.from_pretrained(
65
+ ... "stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16
66
+ ... )
67
+ >>> pipe.to("cuda")
68
+ >>> prompt = "A cat holding a sign that says hello world"
69
+ >>> image = pipe(prompt).images[0]
70
+ >>> image.save("sd3.png")
71
+ ```
72
+ """
73
+
74
+
75
+ # Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
76
+ def calculate_shift(
77
+ image_seq_len,
78
+ base_seq_len: int = 256,
79
+ max_seq_len: int = 4096,
80
+ base_shift: float = 0.5,
81
+ max_shift: float = 1.15,
82
+ ):
83
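+ # Linearly interpolate the flow-shift parameter mu between base_shift and max_shift as a function of the image token sequence length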
+ m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
84
+ b = base_shift - m * base_seq_len
85
+ mu = image_seq_len * m + b
86
+ return mu
87
+
88
+
89
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
90
+ def retrieve_timesteps(
91
+ scheduler,
92
+ num_inference_steps: Optional[int] = None,
93
+ device: Optional[Union[str, torch.device]] = None,
94
+ timesteps: Optional[List[int]] = None,
95
+ sigmas: Optional[List[float]] = None,
96
+ **kwargs,
97
+ ):
98
+ r"""
99
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
100
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
101
+
102
+ Args:
103
+ scheduler (`SchedulerMixin`):
104
+ The scheduler to get timesteps from.
105
+ num_inference_steps (`int`):
106
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
107
+ must be `None`.
108
+ device (`str` or `torch.device`, *optional*):
109
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
110
+ timesteps (`List[int]`, *optional*):
111
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
112
+ `num_inference_steps` and `sigmas` must be `None`.
113
+ sigmas (`List[float]`, *optional*):
114
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
115
+ `num_inference_steps` and `timesteps` must be `None`.
116
+
117
+ Returns:
118
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
119
+ second element is the number of inference steps.
120
+ """
121
+ if timesteps is not None and sigmas is not None:
122
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
123
+ if timesteps is not None:
124
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
125
+ if not accepts_timesteps:
126
+ raise ValueError(
127
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
128
+ f" timestep schedules. Please check whether you are using the correct scheduler."
129
+ )
130
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
131
+ timesteps = scheduler.timesteps
132
+ num_inference_steps = len(timesteps)
133
+ elif sigmas is not None:
134
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
135
+ if not accept_sigmas:
136
+ raise ValueError(
137
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
138
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
139
+ )
140
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
141
+ timesteps = scheduler.timesteps
142
+ num_inference_steps = len(timesteps)
143
+ else:
144
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
145
+ timesteps = scheduler.timesteps
146
+ return timesteps, num_inference_steps
147
+
148
+
149
+ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingleFileMixin, SD3IPAdapterMixin):
150
+ r"""
151
+ Args:
152
+ transformer ([`SD3Transformer2DModel`]):
153
+ Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
154
+ scheduler ([`FlowMatchEulerDiscreteScheduler`]):
155
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
156
+ vae ([`AutoencoderKL`]):
157
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
158
+ text_encoder ([`CLIPTextModelWithProjection`]):
159
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
160
+ specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant,
161
+ with an additional added projection layer that is initialized with a diagonal matrix with the `hidden_size`
162
+ as its dimension.
163
+ text_encoder_2 ([`CLIPTextModelWithProjection`]):
164
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
165
+ specifically the
166
+ [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
167
+ variant.
168
+ text_encoder_3 ([`T5EncoderModel`]):
169
+ Frozen text-encoder. Stable Diffusion 3 uses
170
+ [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
171
+ [t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
172
+ tokenizer (`CLIPTokenizer`):
173
+ Tokenizer of class
174
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
175
+ tokenizer_2 (`CLIPTokenizer`):
176
+ Second Tokenizer of class
177
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
178
+ tokenizer_3 (`T5TokenizerFast`):
179
+ Tokenizer of class
180
+ [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
181
+ image_encoder (`SiglipVisionModel`, *optional*):
182
+ Pre-trained Vision Model for IP Adapter.
183
+ feature_extractor (`SiglipImageProcessor`, *optional*):
184
+ Image processor for IP Adapter.
185
+ """
186
+
187
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->text_encoder_3->image_encoder->transformer->vae"
188
+ _optional_components = ["image_encoder", "feature_extractor"]
189
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "negative_pooled_prompt_embeds"]
190
+
191
+ def __init__(
192
+ self,
193
+ transformer: SD3Transformer2DModel,
194
+ scheduler: FlowMatchEulerDiscreteScheduler,
195
+ vae: AutoencoderKL,
196
+ text_encoder: CLIPTextModelWithProjection,
197
+ tokenizer: CLIPTokenizer,
198
+ text_encoder_2: CLIPTextModelWithProjection,
199
+ tokenizer_2: CLIPTokenizer,
200
+ text_encoder_3: T5EncoderModel,
201
+ tokenizer_3: T5TokenizerFast,
202
+ image_encoder: SiglipVisionModel = None,
203
+ feature_extractor: SiglipImageProcessor = None,
204
+ ):
205
+ super().__init__()
206
+
207
+ self.register_modules(
208
+ vae=vae,
209
+ text_encoder=text_encoder,
210
+ text_encoder_2=text_encoder_2,
211
+ text_encoder_3=text_encoder_3,
212
+ tokenizer=tokenizer,
213
+ tokenizer_2=tokenizer_2,
214
+ tokenizer_3=tokenizer_3,
215
+ transformer=transformer,
216
+ scheduler=scheduler,
217
+ image_encoder=image_encoder,
218
+ feature_extractor=feature_extractor,
219
+ )
220
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8
221
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
222
+ self.tokenizer_max_length = (
223
+ self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
224
+ )
225
+ self.default_sample_size = (
226
+ self.transformer.config.sample_size
227
+ if hasattr(self, "transformer") and self.transformer is not None
228
+ else 128
229
+ )
230
+ self.patch_size = (
231
+ self.transformer.config.patch_size if hasattr(self, "transformer") and self.transformer is not None else 2
232
+ )
233
+
234
+ def _get_t5_prompt_embeds(
235
+ self,
236
+ prompt: Union[str, List[str]] = None,
237
+ num_images_per_prompt: int = 1,
238
+ max_sequence_length: int = 256,
239
+ device: Optional[torch.device] = None,
240
+ dtype: Optional[torch.dtype] = None,
241
+ ):
242
+ device = device or self._execution_device
243
+ dtype = dtype or self.text_encoder.dtype
244
+
245
+ prompt = [prompt] if isinstance(prompt, str) else prompt
246
+ batch_size = len(prompt)
247
+
248
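+ # Without a T5 text encoder, return zero embeddings with the expected sequence length and joint attention width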
+ if self.text_encoder_3 is None:
249
+ return torch.zeros(
250
+ (
251
+ batch_size * num_images_per_prompt,
252
+ self.tokenizer_max_length,
253
+ self.transformer.config.joint_attention_dim,
254
+ ),
255
+ device=device,
256
+ dtype=dtype,
257
+ )
258
+
259
+ text_inputs = self.tokenizer_3(
260
+ prompt,
261
+ padding="max_length",
262
+ max_length=max_sequence_length,
263
+ truncation=True,
264
+ add_special_tokens=True,
265
+ return_tensors="pt",
266
+ )
267
+ text_input_ids = text_inputs.input_ids
268
+ untruncated_ids = self.tokenizer_3(prompt, padding="longest", return_tensors="pt").input_ids
269
+
270
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
271
+ removed_text = self.tokenizer_3.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
272
+ logger.warning(
273
+ "The following part of your input was truncated because `max_sequence_length` is set to "
274
+ f" {max_sequence_length} tokens: {removed_text}"
275
+ )
276
+
277
+ prompt_embeds = self.text_encoder_3(text_input_ids.to(device))[0]
278
+
279
+ dtype = self.text_encoder_3.dtype
280
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
281
+
282
+ _, seq_len, _ = prompt_embeds.shape
283
+
284
+ # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
285
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
286
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
287
+
288
+ return prompt_embeds
289
+
290
+ def _get_clip_prompt_embeds(
291
+ self,
292
+ prompt: Union[str, List[str]],
293
+ num_images_per_prompt: int = 1,
294
+ device: Optional[torch.device] = None,
295
+ clip_skip: Optional[int] = None,
296
+ clip_model_index: int = 0,
297
+ ):
298
+ device = device or self._execution_device
299
+
300
+ clip_tokenizers = [self.tokenizer, self.tokenizer_2]
301
+ clip_text_encoders = [self.text_encoder, self.text_encoder_2]
302
+
303
+ tokenizer = clip_tokenizers[clip_model_index]
304
+ text_encoder = clip_text_encoders[clip_model_index]
305
+
306
+ prompt = [prompt] if isinstance(prompt, str) else prompt
307
+ batch_size = len(prompt)
308
+
309
+ text_inputs = tokenizer(
310
+ prompt,
311
+ padding="max_length",
312
+ max_length=self.tokenizer_max_length,
313
+ truncation=True,
314
+ return_tensors="pt",
315
+ )
316
+
317
+ text_input_ids = text_inputs.input_ids
318
+ untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
319
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
320
+ removed_text = tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1 : -1])
321
+ logger.warning(
322
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
323
+ f" {self.tokenizer_max_length} tokens: {removed_text}"
324
+ )
325
+ prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
326
+ pooled_prompt_embeds = prompt_embeds[0]
327
+
328
+ if clip_skip is None:
329
+ prompt_embeds = prompt_embeds.hidden_states[-2]
330
+ else:
331
+ prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
332
+
333
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
334
+
335
+ _, seq_len, _ = prompt_embeds.shape
336
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
337
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
338
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
339
+
340
+ pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1)
341
+ pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1)
342
+
343
+ return prompt_embeds, pooled_prompt_embeds
344
+
345
+ def encode_prompt(
346
+ self,
347
+ prompt: Union[str, List[str]],
348
+ prompt_2: Union[str, List[str]],
349
+ prompt_3: Union[str, List[str]],
350
+ device: Optional[torch.device] = None,
351
+ num_images_per_prompt: int = 1,
352
+ do_classifier_free_guidance: bool = True,
353
+ negative_prompt: Optional[Union[str, List[str]]] = None,
354
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
355
+ negative_prompt_3: Optional[Union[str, List[str]]] = None,
356
+ prompt_embeds: Optional[torch.FloatTensor] = None,
357
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
358
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
359
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
360
+ clip_skip: Optional[int] = None,
361
+ max_sequence_length: int = 256,
362
+ lora_scale: Optional[float] = None,
363
+ ):
364
+ r"""
365
+
366
+ Args:
367
+ prompt (`str` or `List[str]`, *optional*):
368
+ prompt to be encoded
369
+ prompt_2 (`str` or `List[str]`, *optional*):
370
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
371
+ used in all text-encoders
372
+ prompt_3 (`str` or `List[str]`, *optional*):
373
+ The prompt or prompts to be sent to the `tokenizer_3` and `text_encoder_3`. If not defined, `prompt` is
374
+ used in all text-encoders
375
+ device: (`torch.device`):
376
+ torch device
377
+ num_images_per_prompt (`int`):
378
+ number of images that should be generated per prompt
379
+ do_classifier_free_guidance (`bool`):
380
+ whether to use classifier free guidance or not
381
+ negative_prompt (`str` or `List[str]`, *optional*):
382
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
383
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
384
+ less than `1`).
385
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
386
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
387
+ `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
388
+ negative_prompt_3 (`str` or `List[str]`, *optional*):
389
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
390
+ `text_encoder_3`. If not defined, `negative_prompt` is used in all the text-encoders.
391
+ prompt_embeds (`torch.FloatTensor`, *optional*):
392
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
393
+ provided, text embeddings will be generated from `prompt` input argument.
394
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
395
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
396
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
397
+ argument.
398
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
399
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
400
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
401
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
402
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
403
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
404
+ input argument.
405
+ clip_skip (`int`, *optional*):
406
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
407
+ the output of the pre-final layer will be used for computing the prompt embeddings.
408
+ lora_scale (`float`, *optional*):
409
+ A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
410
+ """
411
+ device = device or self._execution_device
412
+
413
+ # set lora scale so that monkey patched LoRA
414
+ # function of text encoder can correctly access it
415
+ if lora_scale is not None and isinstance(self, SD3LoraLoaderMixin):
416
+ self._lora_scale = lora_scale
417
+
418
+ # dynamically adjust the LoRA scale
419
+ if self.text_encoder is not None and USE_PEFT_BACKEND:
420
+ scale_lora_layers(self.text_encoder, lora_scale)
421
+ if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
422
+ scale_lora_layers(self.text_encoder_2, lora_scale)
423
+
424
+ prompt = [prompt] if isinstance(prompt, str) else prompt
425
+ if prompt is not None:
426
+ batch_size = len(prompt)
427
+ else:
428
+ batch_size = prompt_embeds.shape[0]
429
+
430
+
431
+ if prompt_embeds is None:
432
+ prompt_2 = prompt_2 or prompt
433
+ prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
434
+
435
+ prompt_3 = prompt_3 or prompt
436
+ prompt_3 = [prompt_3] if isinstance(prompt_3, str) else prompt_3
437
+
438
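+ # SD3 builds its conditioning from two CLIP text encoders (token + pooled embeddings) and one T5 encoder (token embeddings only).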
+ prompt_embed, pooled_prompt_embed = self._get_clip_prompt_embeds(
439
+ prompt=prompt,
440
+ device=device,
441
+ num_images_per_prompt=num_images_per_prompt,
442
+ clip_skip=clip_skip,
443
+ clip_model_index=0,
444
+ )
445
+
446
+ prompt_2_embed, pooled_prompt_2_embed = self._get_clip_prompt_embeds(
447
+ prompt=prompt_2,
448
+ device=device,
449
+ num_images_per_prompt=num_images_per_prompt,
450
+ clip_skip=clip_skip,
451
+ clip_model_index=1,
452
+ )
453
+
454
+ clip_prompt_embeds = torch.cat([prompt_embed, prompt_2_embed], dim=-1)
455
+
456
+ t5_prompt_embed = self._get_t5_prompt_embeds(
457
+ prompt=prompt_3,
458
+ num_images_per_prompt=num_images_per_prompt,
459
+ max_sequence_length=max_sequence_length,
460
+ device=device,
461
+ )
463
+
464
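+ # Zero-pad the concatenated CLIP token embeddings along the feature dim to match the T5 hidden size, then stack CLIP and T5 tokens along the sequence dim.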
+ clip_prompt_embeds = torch.nn.functional.pad(
465
+ clip_prompt_embeds, (0, t5_prompt_embed.shape[-1] - clip_prompt_embeds.shape[-1])
466
+ )
467
+
468
+ prompt_embeds = torch.cat([clip_prompt_embeds, t5_prompt_embed], dim=-2)
469
+ pooled_prompt_embeds = torch.cat([pooled_prompt_embed, pooled_prompt_2_embed], dim=-1)
470
+
471
+
472
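+ # When classifier-free guidance is enabled and no negative embeddings were supplied, encode the negative prompts the same way.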
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
473
+ negative_prompt = negative_prompt or ""
474
+ negative_prompt_2 = negative_prompt_2 or negative_prompt
475
+ negative_prompt_3 = negative_prompt_3 or negative_prompt
476
+
477
+ # normalize str to list
478
+ negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
479
+ negative_prompt_2 = (
480
+ batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2
481
+ )
482
+ negative_prompt_3 = (
483
+ batch_size * [negative_prompt_3] if isinstance(negative_prompt_3, str) else negative_prompt_3
484
+ )
485
+
486
+ if prompt is not None and type(prompt) is not type(negative_prompt):
487
+ raise TypeError(
488
+ f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
489
+ f" {type(prompt)}."
490
+ )
491
+ elif batch_size != len(negative_prompt):
492
+ raise ValueError(
493
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
494
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
495
+ " the batch size of `prompt`."
496
+ )
497
+ negative_prompt_embed, negative_pooled_prompt_embed = self._get_clip_prompt_embeds(
514
+ negative_prompt,
515
+ device=device,
516
+ num_images_per_prompt=num_images_per_prompt,
517
+ clip_skip=None,
518
+ clip_model_index=0,
519
+ )
520
+ negative_prompt_2_embed, negative_pooled_prompt_2_embed = self._get_clip_prompt_embeds(
521
+ negative_prompt_2,
522
+ device=device,
523
+ num_images_per_prompt=num_images_per_prompt,
524
+ clip_skip=None,
525
+ clip_model_index=1,
526
+ )
527
+ negative_clip_prompt_embeds = torch.cat([negative_prompt_embed, negative_prompt_2_embed], dim=-1)
528
+
529
+ t5_negative_prompt_embed = self._get_t5_prompt_embeds(
530
+ prompt=negative_prompt_3,
531
+ num_images_per_prompt=num_images_per_prompt,
532
+ max_sequence_length=max_sequence_length,
533
+ device=device,
534
+ )
535
+
536
+ negative_clip_prompt_embeds = torch.nn.functional.pad(
537
+ negative_clip_prompt_embeds,
538
+ (0, t5_negative_prompt_embed.shape[-1] - negative_clip_prompt_embeds.shape[-1]),
539
+ )
540
+
541
+ negative_prompt_embeds = torch.cat([negative_clip_prompt_embeds, t5_negative_prompt_embed], dim=-2)
542
+ negative_pooled_prompt_embeds = torch.cat(
543
+ [negative_pooled_prompt_embed, negative_pooled_prompt_2_embed], dim=-1
544
+ )
545
+
546
+ if self.text_encoder is not None:
547
+ if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND:
548
+ # Retrieve the original scale by scaling back the LoRA layers
549
+ unscale_lora_layers(self.text_encoder, lora_scale)
550
+
551
+ if self.text_encoder_2 is not None:
552
+ if isinstance(self, SD3LoraLoaderMixin) and USE_PEFT_BACKEND:
553
+ # Retrieve the original scale by scaling back the LoRA layers
554
+ unscale_lora_layers(self.text_encoder_2, lora_scale)
555
+
556
+ return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds
557
+
558
+ def check_inputs(
559
+ self,
560
+ prompt,
561
+ prompt_2,
562
+ prompt_3,
563
+ height,
564
+ width,
565
+ negative_prompt=None,
566
+ negative_prompt_2=None,
567
+ negative_prompt_3=None,
568
+ prompt_embeds=None,
569
+ negative_prompt_embeds=None,
570
+ pooled_prompt_embeds=None,
571
+ negative_pooled_prompt_embeds=None,
572
+ callback_on_step_end_tensor_inputs=None,
573
+ max_sequence_length=None,
574
+ ):
575
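+ # The latent grid must tile evenly: height and width have to be divisible by vae_scale_factor * patch_size.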
+ if (
576
+ height % (self.vae_scale_factor * self.patch_size) != 0
577
+ or width % (self.vae_scale_factor * self.patch_size) != 0
578
+ ):
579
+ raise ValueError(
580
+ f"`height` and `width` have to be divisible by {self.vae_scale_factor * self.patch_size} but are {height} and {width}."
581
+ f" You can use height {height - height % (self.vae_scale_factor * self.patch_size)} and width {width - width % (self.vae_scale_factor * self.patch_size)}."
582
+ )
583
+
584
+ if callback_on_step_end_tensor_inputs is not None and not all(
585
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
586
+ ):
587
+ raise ValueError(
588
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
589
+ )
590
+
591
+ if prompt is not None and prompt_embeds is not None:
592
+ raise ValueError(
593
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
594
+ " only forward one of the two."
595
+ )
596
+ elif prompt_2 is not None and prompt_embeds is not None:
597
+ raise ValueError(
598
+ f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
599
+ " only forward one of the two."
600
+ )
601
+ elif prompt_3 is not None and prompt_embeds is not None:
602
+ raise ValueError(
603
+ f"Cannot forward both `prompt_3`: {prompt_3} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
604
+ " only forward one of the two."
605
+ )
606
+ elif prompt is None and prompt_embeds is None:
607
+ raise ValueError(
608
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
609
+ )
610
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
611
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
612
+ elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
613
+ raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
614
+ elif prompt_3 is not None and (not isinstance(prompt_3, str) and not isinstance(prompt_3, list)):
615
+ raise ValueError(f"`prompt_3` has to be of type `str` or `list` but is {type(prompt_3)}")
616
+
617
+ if negative_prompt is not None and negative_prompt_embeds is not None:
618
+ raise ValueError(
619
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
620
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
621
+ )
622
+ elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
623
+ raise ValueError(
624
+ f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
625
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
626
+ )
627
+ elif negative_prompt_3 is not None and negative_prompt_embeds is not None:
628
+ raise ValueError(
629
+ f"Cannot forward both `negative_prompt_3`: {negative_prompt_3} and `negative_prompt_embeds`:"
630
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
631
+ )
632
+
633
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
634
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
635
+ raise ValueError(
636
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
637
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
638
+ f" {negative_prompt_embeds.shape}."
639
+ )
640
+
641
+ if prompt_embeds is not None and pooled_prompt_embeds is None:
642
+ raise ValueError(
643
+ "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
644
+ )
645
+
646
+ if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
647
+ raise ValueError(
648
+ "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
649
+ )
650
+
651
+ if max_sequence_length is not None and max_sequence_length > 512:
652
+ raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
653
+
654
+ def prepare_latents(
655
+ self,
656
+ batch_size,
657
+ num_channels_latents,
658
+ height,
659
+ width,
660
+ dtype,
661
+ device,
662
+ generator,
663
+ latents=None,
664
+ ):
665
+ if latents is not None:
666
+ return latents.to(device=device, dtype=dtype)
667
+
668
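+ # Latent spatial dims are the requested pixel dims divided by the VAE downsampling factor.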
+ shape = (
669
+ batch_size,
670
+ num_channels_latents,
671
+ int(height) // self.vae_scale_factor,
672
+ int(width) // self.vae_scale_factor,
673
+ )
674
+
675
+ if isinstance(generator, list) and len(generator) != batch_size:
676
+ raise ValueError(
677
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
678
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
679
+ )
680
+
681
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
682
+
683
+ return latents
684
+
685
+ @property
686
+ def guidance_scale(self):
687
+ return self._guidance_scale
688
+
689
+ @property
690
+ def skip_guidance_layers(self):
691
+ return self._skip_guidance_layers
692
+
693
+ @property
694
+ def clip_skip(self):
695
+ return self._clip_skip
696
+
697
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
698
+ # of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
699
+ # corresponds to doing no classifier free guidance.
700
+ @property
701
+ def do_classifier_free_guidance(self):
702
+ return self._guidance_scale > 1
703
+
704
+ @property
705
+ def joint_attention_kwargs(self):
706
+ return self._joint_attention_kwargs
707
+
708
+ @property
709
+ def num_timesteps(self):
710
+ return self._num_timesteps
711
+
712
+ @property
713
+ def interrupt(self):
714
+ return self._interrupt
715
+
716
+ def encode_image(self, image: PipelineImageInput, device: torch.device) -> torch.Tensor:
717
+ """Encodes the given image into a feature representation using a pre-trained image encoder.
718
+
719
+ Args:
720
+ image (`PipelineImageInput`):
721
+ Input image to be encoded.
722
+ device: (`torch.device`):
723
+ Torch device.
724
+
725
+ Returns:
726
+ `torch.Tensor`: The encoded image feature representation.
727
+ """
728
+ if not isinstance(image, torch.Tensor):
729
+ image = self.feature_extractor(image, return_tensors="pt").pixel_values
730
+
731
+ image = image.to(device=device, dtype=self.dtype)
732
+
733
+ return self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
734
+
735
+ def prepare_ip_adapter_image_embeds(
736
+ self,
737
+ ip_adapter_image: Optional[PipelineImageInput] = None,
738
+ ip_adapter_image_embeds: Optional[torch.Tensor] = None,
739
+ device: Optional[torch.device] = None,
740
+ num_images_per_prompt: int = 1,
741
+ do_classifier_free_guidance: bool = True,
742
+ ) -> torch.Tensor:
743
+ """Prepares image embeddings for use in the IP-Adapter.
744
+
745
+ Either `ip_adapter_image` or `ip_adapter_image_embeds` must be passed.
746
+
747
+ Args:
748
+ ip_adapter_image (`PipelineImageInput`, *optional*):
749
+ The input image to extract features from for IP-Adapter.
750
+ ip_adapter_image_embeds (`torch.Tensor`, *optional*):
751
+ Precomputed image embeddings.
752
+ device: (`torch.device`, *optional*):
753
+ Torch device.
754
+ num_images_per_prompt (`int`, defaults to 1):
755
+ Number of images that should be generated per prompt.
756
+ do_classifier_free_guidance (`bool`, defaults to True):
757
+ Whether to use classifier free guidance or not.
758
+ """
759
+ device = device or self._execution_device
760
+
761
+ if ip_adapter_image_embeds is not None:
762
+ if do_classifier_free_guidance:
763
+ single_negative_image_embeds, single_image_embeds = ip_adapter_image_embeds.chunk(2)
764
+ else:
765
+ single_image_embeds = ip_adapter_image_embeds
766
+ elif ip_adapter_image is not None:
767
+ single_image_embeds = self.encode_image(ip_adapter_image, device)
768
+ if do_classifier_free_guidance:
769
+ single_negative_image_embeds = torch.zeros_like(single_image_embeds)
770
+ else:
771
+ raise ValueError("Neither `ip_adapter_image` nor `ip_adapter_image_embeds` was provided.")
772
+
773
+ image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
774
+
775
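+ # For CFG, prepend the negative (zeroed) image embeds so they line up with the unconditional half of the prompt batch.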
+ if do_classifier_free_guidance:
776
+ negative_image_embeds = torch.cat([single_negative_image_embeds] * num_images_per_prompt, dim=0)
777
+ image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0)
778
+
779
+ return image_embeds.to(device=device)
780
+
781
+ def enable_sequential_cpu_offload(self, *args, **kwargs):
782
+ if self.image_encoder is not None and "image_encoder" not in self._exclude_from_cpu_offload:
783
+ logger.warning(
784
+ "`pipe.enable_sequential_cpu_offload()` might fail for `image_encoder` if it uses "
785
+ "`torch.nn.MultiheadAttention`. You can exclude `image_encoder` from CPU offloading by calling "
786
+ "`pipe._exclude_from_cpu_offload.append('image_encoder')` before `pipe.enable_sequential_cpu_offload()`."
787
+ )
788
+
789
+ super().enable_sequential_cpu_offload(*args, **kwargs)
790
+
791
+ @torch.no_grad()
792
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
793
+ def __call__(
794
+ self,
795
+ prompt: Union[str, List[str]] = None,
796
+ prompt_2: Optional[Union[str, List[str]]] = None,
797
+ prompt_3: Optional[Union[str, List[str]]] = None,
798
+ height: Optional[int] = None,
799
+ width: Optional[int] = None,
800
+ num_inference_steps: int = 28,
801
+ sigmas: Optional[List[float]] = None,
802
+ guidance_scale: float = 7.0,
803
+ negative_prompt: Optional[Union[str, List[str]]] = None,
804
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
805
+ negative_prompt_3: Optional[Union[str, List[str]]] = None,
806
+ num_images_per_prompt: Optional[int] = 1,
807
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
808
+ latents: Optional[torch.FloatTensor] = None,
809
+ prompt_embeds: Optional[torch.FloatTensor] = None,
810
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
811
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
812
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
813
+ ip_adapter_image: Optional[PipelineImageInput] = None,
814
+ ip_adapter_image_embeds: Optional[torch.Tensor] = None,
815
+ output_type: Optional[str] = "pil",
816
+ return_dict: bool = True,
817
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
818
+ clip_skip: Optional[int] = None,
819
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
820
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
821
+ max_sequence_length: int = 256,
822
+ skip_guidance_layers: List[int] = None,
823
+ skip_layer_guidance_scale: float = 2.8,
824
+ skip_layer_guidance_stop: float = 0.2,
825
+ skip_layer_guidance_start: float = 0.01,
826
+ mu: Optional[float] = None,
827
+ ):
828
+ r"""
829
+ Function invoked when calling the pipeline for generation.
830
+
831
+ Args:
832
+ prompt (`str` or `List[str]`, *optional*):
833
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
834
+ instead.
835
+ prompt_2 (`str` or `List[str]`, *optional*):
836
+ The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt`
837
+ will be used instead.
838
+ prompt_3 (`str` or `List[str]`, *optional*):
839
+ The prompt or prompts to be sent to `tokenizer_3` and `text_encoder_3`. If not defined, `prompt`
840
+ will be used instead.
841
+ height (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor):
842
+ The height in pixels of the generated image. This is set to 1024 by default for the best results.
843
+ width (`int`, *optional*, defaults to self.default_sample_size * self.vae_scale_factor):
844
+ The width in pixels of the generated image. This is set to 1024 by default for the best results.
845
+ num_inference_steps (`int`, *optional*, defaults to 28):
846
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
847
+ expense of slower inference.
848
+ sigmas (`List[float]`, *optional*):
849
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
850
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
851
+ will be used.
852
+ guidance_scale (`float`, *optional*, defaults to 7.0):
853
+ Guidance scale as defined in [Classifier-Free Diffusion
854
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
855
+ of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
856
+ `guidance_scale > 1`. Higher guidance scale encourages the model to generate images that are closely linked to
857
+ the text `prompt`, usually at the expense of lower image quality.
858
+ negative_prompt (`str` or `List[str]`, *optional*):
859
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
860
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
861
+ less than `1`).
862
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
863
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
864
+ `text_encoder_2`. If not defined, `negative_prompt` is used instead
865
+ negative_prompt_3 (`str` or `List[str]`, *optional*):
866
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
867
+ `text_encoder_3`. If not defined, `negative_prompt` is used instead
868
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
869
+ The number of images to generate per prompt.
870
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
871
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
872
+ to make generation deterministic.
873
+ latents (`torch.FloatTensor`, *optional*):
874
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
875
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
876
+ tensor will be generated by sampling using the supplied random `generator`.
877
+ prompt_embeds (`torch.FloatTensor`, *optional*):
878
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
879
+ provided, text embeddings will be generated from `prompt` input argument.
880
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
881
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
882
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
883
+ argument.
884
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
885
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
886
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
887
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
888
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
889
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
890
+ input argument.
891
+ ip_adapter_image (`PipelineImageInput`, *optional*):
892
+ Optional image input to work with IP Adapters.
893
+ ip_adapter_image_embeds (`torch.Tensor`, *optional*):
894
+ Pre-generated image embeddings for IP-Adapter. Should be a tensor of shape `(batch_size, num_images,
895
+ emb_dim)`. It should contain the negative image embedding if `do_classifier_free_guidance` is set to
896
+ `True`. If not provided, embeddings are computed from the `ip_adapter_image` input argument.
897
+ output_type (`str`, *optional*, defaults to `"pil"`):
898
+ The output format of the generated image. Choose between
899
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
900
+ return_dict (`bool`, *optional*, defaults to `True`):
901
+ Whether or not to return a [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] instead of
902
+ a plain tuple.
903
+ joint_attention_kwargs (`dict`, *optional*):
904
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
905
+ `self.processor` in
906
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
907
+ callback_on_step_end (`Callable`, *optional*):
908
+ A function that is called at the end of each denoising step during inference. The function is called
909
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
910
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
911
+ `callback_on_step_end_tensor_inputs`.
912
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
913
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
914
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
915
+ `._callback_tensor_inputs` attribute of your pipeline class.
916
+ max_sequence_length (`int` defaults to 256): Maximum sequence length to use with the `prompt`.
917
+ skip_guidance_layers (`List[int]`, *optional*):
918
+ A list of integers that specify layers to skip during guidance. If not provided, all layers will be
919
+ used for guidance. If provided, the guidance will only be applied to the layers specified in the list.
920
+ Recommended value by StabilityAI for Stable Diffusion 3.5 Medium is [7, 8, 9].
921
+ skip_layer_guidance_scale (`float`, *optional*): The scale of the guidance for the layers specified in
922
+ `skip_guidance_layers`. The guidance will be applied to the layers specified in `skip_guidance_layers`
923
+ with a scale of `skip_layer_guidance_scale`. The guidance will be applied to the rest of the layers
924
+ with a scale of `1`.
925
+ skip_layer_guidance_stop (`float`, *optional*): The step at which the guidance for the layers specified in
926
+ `skip_guidance_layers` will stop. The guidance will be applied to the layers specified in
927
+ `skip_guidance_layers` until the fraction specified in `skip_layer_guidance_stop`. Recommended value by
928
+ StabilityAI for Stable Diffusion 3.5 Medium is 0.2.
929
+ skip_layer_guidance_start (`float`, *optional*): The step at which the guidance for the layers specified in
930
+ `skip_guidance_layers` will start. The guidance will be applied to the layers specified in
931
+ `skip_guidance_layers` from the fraction specified in `skip_layer_guidance_start`. Recommended value by
932
+ StabilityAI for Stable Diffusion 3.5 Medium is 0.01.
933
+ mu (`float`, *optional*): `mu` value used for `dynamic_shifting`.
934
+
935
+ Examples:
936
+
937
+ Returns:
938
+ [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] or `tuple`:
939
+ [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] if `return_dict` is True, otherwise a
940
+ `tuple`. When returning a tuple, the first element is a list with the generated images.
941
+ """
942
+
943
+ height = height or self.default_sample_size * self.vae_scale_factor
944
+ width = width or self.default_sample_size * self.vae_scale_factor
945
+
946
+ # 1. Check inputs. Raise error if not correct
947
+ self.check_inputs(
948
+ prompt,
949
+ prompt_2,
950
+ prompt_3,
951
+ height,
952
+ width,
953
+ negative_prompt=negative_prompt,
954
+ negative_prompt_2=negative_prompt_2,
955
+ negative_prompt_3=negative_prompt_3,
956
+ prompt_embeds=prompt_embeds,
957
+ negative_prompt_embeds=negative_prompt_embeds,
958
+ pooled_prompt_embeds=pooled_prompt_embeds,
959
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
960
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
961
+ max_sequence_length=max_sequence_length,
962
+ )
963
+
964
+ self._guidance_scale = guidance_scale
965
+ self._skip_layer_guidance_scale = skip_layer_guidance_scale
966
+ self._clip_skip = clip_skip
967
+ self._joint_attention_kwargs = joint_attention_kwargs
968
+ self._interrupt = False
969
+
970
+ # 2. Define call parameters
971
+ if prompt is not None and isinstance(prompt, str):
972
+ batch_size = 1
973
+ elif prompt is not None and isinstance(prompt, list):
974
+ batch_size = len(prompt)
975
+ else:
976
+ batch_size = prompt_embeds.shape[0]
977
+
978
+ device = self._execution_device
979
+
980
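+ # 3. Encode the input prompt(s) with the CLIP and T5 text encoders (applying any LoRA scale from joint_attention_kwargs).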
+ lora_scale = (
981
+ self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
982
+ )
983
+ (
984
+ prompt_embeds,
985
+ negative_prompt_embeds,
986
+ pooled_prompt_embeds,
987
+ negative_pooled_prompt_embeds,
988
+ ) = self.encode_prompt(
989
+ prompt=prompt,
990
+ prompt_2=prompt_2,
991
+ prompt_3=prompt_3,
992
+ negative_prompt=negative_prompt,
993
+ negative_prompt_2=negative_prompt_2,
994
+ negative_prompt_3=negative_prompt_3,
995
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
996
+ prompt_embeds=prompt_embeds,
997
+ negative_prompt_embeds=negative_prompt_embeds,
998
+ pooled_prompt_embeds=pooled_prompt_embeds,
999
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
1000
+ device=device,
1001
+ clip_skip=self.clip_skip,
1002
+ num_images_per_prompt=num_images_per_prompt,
1003
+ max_sequence_length=max_sequence_length,
1004
+ lora_scale=lora_scale,
1005
+ )
1006
+
1007
+ if self.do_classifier_free_guidance:
1008
+ if skip_guidance_layers is not None:
1009
+ original_prompt_embeds = prompt_embeds
1010
+ original_pooled_prompt_embeds = pooled_prompt_embeds
1011
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
1012
+ pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
1013
+
1014
+ # 4. Prepare latent variables
1015
+ num_channels_latents = self.transformer.config.in_channels
1016
+ latents = self.prepare_latents(
1017
+ batch_size * num_images_per_prompt,
1018
+ num_channels_latents,
1019
+ height,
1020
+ width,
1021
+ prompt_embeds.dtype,
1022
+ device,
1023
+ generator,
1024
+ latents,
1025
+ )
1026
+
1027
+ # 5. Prepare timesteps
1028
+ scheduler_kwargs = {}
1029
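+ # Resolution-dependent timestep shifting: when the scheduler uses dynamic shifting and no `mu` is given, derive it from the latent token count.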
+ if self.scheduler.config.get("use_dynamic_shifting", None) and mu is None:
1030
+ _, _, height, width = latents.shape
1031
+ image_seq_len = (height // self.transformer.config.patch_size) * (
1032
+ width // self.transformer.config.patch_size
1033
+ )
1034
+ mu = calculate_shift(
1035
+ image_seq_len,
1036
+ self.scheduler.config.get("base_image_seq_len", 256),
1037
+ self.scheduler.config.get("max_image_seq_len", 4096),
1038
+ self.scheduler.config.get("base_shift", 0.5),
1039
+ self.scheduler.config.get("max_shift", 1.16),
1040
+ )
1041
+ scheduler_kwargs["mu"] = mu
1042
+ elif mu is not None:
1043
+ scheduler_kwargs["mu"] = mu
1044
+ timesteps, num_inference_steps = retrieve_timesteps(
1045
+ self.scheduler,
1046
+ num_inference_steps,
1047
+ device,
1048
+ sigmas=sigmas,
1049
+ **scheduler_kwargs,
1050
+ )
1051
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
1052
+ self._num_timesteps = len(timesteps)
1053
+
1054
+ # 6. Prepare image embeddings
1055
+ if (ip_adapter_image is not None and self.is_ip_adapter_active) or ip_adapter_image_embeds is not None:
1056
+ ip_adapter_image_embeds = self.prepare_ip_adapter_image_embeds(
1057
+ ip_adapter_image,
1058
+ ip_adapter_image_embeds,
1059
+ device,
1060
+ batch_size * num_images_per_prompt,
1061
+ self.do_classifier_free_guidance,
1062
+ )
1063
+
1064
+ if self.joint_attention_kwargs is None:
1065
+ self._joint_attention_kwargs = {"ip_adapter_image_embeds": ip_adapter_image_embeds}
1066
+ else:
1067
+ self._joint_attention_kwargs.update(ip_adapter_image_embeds=ip_adapter_image_embeds)
1068
+
1069
+
1070
+ # 7. Denoising loop
1071
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
1072
+ for i, t in enumerate(timesteps):
1073
+ if self.interrupt:
1074
+ continue
1075
+
1076
+ # expand the latents if we are doing classifier free guidance
1077
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
1078
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
1079
+ timestep = t.expand(latent_model_input.shape[0])
1080
+
1081
+ noise_pred = self.transformer(
1082
+ hidden_states=latent_model_input,
1083
+ timestep=timestep,
1084
+ encoder_hidden_states=prompt_embeds,
1085
+ pooled_projections=pooled_prompt_embeds,
1086
+ joint_attention_kwargs=self.joint_attention_kwargs,
1087
+ return_dict=False,
1088
+ )[0]
1089
+
1090
+ # perform guidance
1091
+ if self.do_classifier_free_guidance:
1092
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
1093
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
1094
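+ # Skip-layer guidance: within the configured [start, stop) step window, re-run the transformer with the specified layers skipped and push the prediction away from that output.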
+ should_skip_layers = (
1095
+ True
1096
+ if i > num_inference_steps * skip_layer_guidance_start
1097
+ and i < num_inference_steps * skip_layer_guidance_stop
1098
+ else False
1099
+ )
1100
+ if skip_guidance_layers is not None and should_skip_layers:
1101
+ timestep = t.expand(latents.shape[0])
1102
+ latent_model_input = latents
1103
+ noise_pred_skip_layers = self.transformer(
1104
+ hidden_states=latent_model_input,
1105
+ timestep=timestep,
1106
+ encoder_hidden_states=original_prompt_embeds,
1107
+ pooled_projections=original_pooled_prompt_embeds,
1108
+ joint_attention_kwargs=self.joint_attention_kwargs,
1109
+ return_dict=False,
1110
+ skip_layers=skip_guidance_layers,
1111
+ )[0]
1112
+ noise_pred = (
1113
+ noise_pred + (noise_pred_text - noise_pred_skip_layers) * self._skip_layer_guidance_scale
1114
+ )
1115
+
1116
+ latents_dtype = latents.dtype
1117
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
1118
+
1119
+ if latents.dtype != latents_dtype:
1120
+ if torch.backends.mps.is_available():
1121
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
1122
+ latents = latents.to(latents_dtype)
1123
+
1124
+ if callback_on_step_end is not None:
1125
+ callback_kwargs = {}
1126
+ for k in callback_on_step_end_tensor_inputs:
1127
+ callback_kwargs[k] = locals()[k]
1128
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
1129
+
1130
+ latents = callback_outputs.pop("latents", latents)
1131
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1132
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
1133
+ negative_pooled_prompt_embeds = callback_outputs.pop(
1134
+ "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
1135
+ )
1136
+
1137
+ # call the callback, if provided
1138
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
1139
+ progress_bar.update()
1140
+
1141
+ if XLA_AVAILABLE:
1142
+ xm.mark_step()
1143
+
1144
+ if output_type == "latent":
1145
+ image = latents
1146
+
1147
+ else:
1148
+
1151
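+ # Undo the latent normalization (scaling_factor / shift_factor) expected by the VAE before decoding to pixels.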
+ latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
1152
+
1153
+ image = self.vae.decode(latents, return_dict=False)[0]
1154
+ image = self.image_processor.postprocess(image, output_type=output_type)
1155
+
1156
+ # Offload all models
1157
+ self.maybe_free_model_hooks()
1158
+
1159
+ if not return_dict:
1160
+ return (image,)
1161
+
1162
+ return StableDiffusion3PipelineOutput(images=image)
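+
+ # Minimal usage sketch, not part of the pipeline itself. The class name
+ # `StableDiffusion3Pipeline` and the checkpoint path are assumptions; substitute
+ # whatever pipeline class this file actually defines and the local UniRL checkpoint.
+ # import torch
+ # pipe = StableDiffusion3Pipeline.from_pretrained("path/to/checkpoint", torch_dtype=torch.bfloat16).to("cuda")
+ # image = pipe("a photo of a corgi wearing sunglasses", num_inference_steps=28, guidance_scale=7.0).images[0]
+ # image.save("sample.png")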