jayyap committed on Mar 25, 2023

Commit

9d51df0

1 Parent(s): c69f654

Upload 28 files

Browse files

Files changed (28) hide show

.gitignore +143 -0
FAQ.md +21 -0
LICENSE +201 -0
README.md +234 -3
annotator.md +49 -0
cog.yaml +51 -0
download_weights.py +17 -0
environment.yaml +33 -0
gradio_annotator.py +160 -0
gradio_canny2image.py +42 -0
gradio_depth2image.py +44 -0
gradio_fake_scribble2image.py +48 -0
gradio_hed2image.py +44 -0
gradio_hough2image.py +44 -0
gradio_normal2image.py +44 -0
gradio_pose2image.py +44 -0
gradio_scribble2image.py +41 -0
gradio_scribble2image_interactive.py +45 -0
gradio_seg2image.py +43 -0
output.0.png +0 -0
output.1.png +0 -0
predict.py +216 -0
tool_add_control.py +49 -0
train.md +251 -0
tutorial_dataset.py +37 -0
tutorial_dataset_test.py +12 -0
tutorial_train.py +36 -0
utils.py +39 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,143 @@

+.idea/
+.cog
+oryx-build-commands.txt
+tmp/*.png
+*.png
+training/
+*.pth
+*.pt
+*.ckpt
+my_fix.py
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/

FAQ.md ADDED Viewed

	@@ -0,0 +1,21 @@

+# FAQs
+**Q:** If the weight of a conv layer is zero, the gradient will also be zero, and the network will not learn anything. Why does "zero convolution" work?
+**A:** This is wrong. Let us consider a very simple case
+$$y=wx+b$$
+and we have
+$$\partial y/\partial w=x, \partial y/\partial x=w, \partial y/\partial b=1$$
+and if $w=0$ and $x \neq 0$, then
+$$\partial y/\partial w \neq 0, \partial y/\partial x=0, \partial y/\partial b\neq 0$$
+which means as long as $x \neq 0$, one gradient descent iteration will make $w$ non-zero. Then
+$$\partial y/\partial x\neq 0$$
+so that the zero convolutions will progressively become a common conv layer with non-zero weights.

LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

README.md CHANGED Viewed

@@ -1,3 +1,234 @@
----
-license: unknown
----

+# ControlNet
+Cog implementation of [Adding Conditional Control to Text-to-Image Diffusion Models](https://github.com/lllyasviel/ControlNet/raw/main/github_page/control.pdf).
+To run this Cog model:
+1. clone this repo
+1. run `cog run python download_weights.py --model_type='desired-model-type-goes-here'`
+1. run `cog predict -i image='@your_img.png' -i prompt='your prompt'`
+1. push to Replicate with `cog push`, if you like
+# About ControlNet
+ControlNet is a neural network structure to control diffusion models by adding extra conditions.
+![img](github_page/he.png)
+It copies the weights of neural network blocks into a "locked" copy and a "trainable" copy.
+The "trainable" one learns your condition. The "locked" one preserves your model.
+Thanks to this, training with a small dataset of image pairs will not destroy the production-ready diffusion models.
+The "zero convolution" is a 1×1 convolution with both weight and bias initialized as zeros.
+Before training, all zero convolutions output zeros, and ControlNet will not cause any distortion.
+No layer is trained from scratch. You are still fine-tuning. Your original model is safe.
+This allows training on small-scale or even personal devices.
+This is also friendly to merge/replacement/offsetting of models/weights/blocks/layers.
+### FAQ
+**Q:** But wait, if the weight of a conv layer is zero, the gradient will also be zero, and the network will not learn anything. Why does "zero convolution" work?
+**A:** This is not true. [See an explanation here](FAQ.md).
+# Stable Diffusion + ControlNet
+By repeating the above simple structure 14 times, we can control stable diffusion in this way:
+![img](github_page/sd.png)
+Note that the way we connect layers is computationally efficient. The original SD encoder does not need to store gradients (the locked original SD Encoder Block 1234 and Middle). The required GPU memory is not much larger than the original SD, although many layers are added. Great!
+# Production-Ready Pretrained Models
+First create a new conda environment
+    conda env create -f environment.yaml
+    conda activate control
+All models and detectors can be downloaded from [our huggingface page](https://huggingface.co/lllyasviel/ControlNet). Make sure that SD models are put in "ControlNet/models" and detectors are put in "ControlNet/annotator/ckpts". Make sure that you download all necessary pretrained weights and detector models from that huggingface page, including HED edge detection model, Midas depth estimation model, Openpose, and so on.
+We provide 9 Gradio apps with these models.
+All test images can be found at the folder "test_imgs".
+## ControlNet with Canny Edge
+Stable Diffusion 1.5 + ControlNet (using simple Canny edge detection)
+    python gradio_canny2image.py
+The Gradio app also allows you to change the Canny edge thresholds. Just try it for more details.
+Prompt: "bird"
+![p](github_page/p1.png)
+Prompt: "cute dog"
+![p](github_page/p2.png)
+## ControlNet with M-LSD Lines
+Stable Diffusion 1.5 + ControlNet (using simple M-LSD straight line detection)
+    python gradio_hough2image.py
+The Gradio app also allows you to change the M-LSD thresholds. Just try it for more details.
+Prompt: "room"
+![p](github_page/p3.png)
+Prompt: "building"
+![p](github_page/p4.png)
+## ControlNet with HED Boundary
+Stable Diffusion 1.5 + ControlNet (using soft HED Boundary)
+    python gradio_hed2image.py
+The soft HED Boundary will preserve many details in input images, making this app suitable for recoloring and stylizing. Just try it for more details.
+Prompt: "oil painting of handsome old man, masterpiece"
+![p](github_page/p5.png)
+Prompt: "Cyberpunk robot"
+![p](github_page/p6.png)
+## ControlNet with User Scribbles
+Stable Diffusion 1.5 + ControlNet (using Scribbles)
+    python gradio_scribble2image.py
+Note that the UI is based on Gradio, and Gradio is somewhat difficult to customize. Right now you need to draw scribbles outside the UI (using your favorite drawing software, for example, MS Paint) and then import the scribble image to Gradio.
+Prompt: "turtle"
+![p](github_page/p7.png)
+Prompt: "hot air balloon"
+![p](github_page/p8.png)
+### Interactive Interface
+We actually provide an interactive interface
+    python gradio_scribble2image_interactive.py
+However, because gradio is very [buggy](https://github.com/gradio-app/gradio/issues/3166) and difficult to customize, right now, users need to first set the canvas width and height and then click "Open drawing canvas" to get a drawing area. Please do not upload an image to that drawing canvas. Also, the drawing area is very small; it should be bigger. But I failed to find out how to make it larger. Again, gradio is really buggy.
+The below dog sketch is drawn by me. Perhaps we should draw a better dog for showcase.
+Prompt: "dog in a room"
+![p](github_page/p20.png)
+## ControlNet with Fake Scribbles
+Stable Diffusion 1.5 + ControlNet (using fake scribbles)
+    python gradio_fake_scribble2image.py
+Sometimes we are lazy, and we do not want to draw scribbles. This script uses exactly the same scribble-based model but uses a simple algorithm to synthesize scribbles from input images.
+Prompt: "bag"
+![p](github_page/p9.png)
+Prompt: "shose" (Note that "shose" is a typo; it should be "shoes". But it still seems to work.)
+![p](github_page/p10.png)
+## ControlNet with Human Pose
+Stable Diffusion 1.5 + ControlNet (using human pose)
+    python gradio_pose2image.py
+Apparently, this model deserves a better UI to directly manipulate pose skeleton. However, again, Gradio is somewhat difficult to customize. Right now you need to input an image and then the Openpose will detect the pose for you.
+Prompt: "Chief in the kitchen"
+![p](github_page/p11.png)
+Prompt: "An astronaut on the moon"
+![p](github_page/p12.png)
+## ControlNet with Semantic Segmentation
+Stable Diffusion 1.5 + ControlNet (using semantic segmentation)
+    python gradio_seg2image.py
+This model uses ADE20K's segmentation protocol. Again, this model deserves a better UI to directly draw the segmentations. However, again, Gradio is somewhat difficult to customize. Right now you need to input an image and then a model called Uniformer will detect the segmentations for you. Just try it for more details.
+Prompt: "House"
+![p](github_page/p13.png)
+Prompt: "River"
+![p](github_page/p14.png)
+## ControlNet with Depth
+Stable Diffusion 1.5 + ControlNet (using depth map)
+    python gradio_depth2image.py
+Great! Now SD 1.5 also has depth control. FINALLY. So many possibilities (considering SD1.5 has many more community models than SD2).
+Note that, unlike Stability's model, the ControlNet receives the full 512×512 depth map, rather than a 64×64 depth map. Note that Stability's SD2 depth model uses 64×64 depth maps. This means that the ControlNet will preserve more details in the depth map.
+This is always a strength because if users do not want to preserve more details, they can simply use another SD to post-process an i2i. But if they want to preserve more details, ControlNet becomes their only choice. Again, SD2 uses 64×64 depth, we use 512×512.
+Prompt: "Stormtrooper's lecture"
+![p](github_page/p15.png)
+## ControlNet with Normal Map
+Stable Diffusion 1.5 + ControlNet (using normal map)
+    python gradio_normal2image.py
+This model uses normal maps. Right now in the app, the normal map is computed from the midas depth map and a user threshold (which determines how much of the area is treated as background with an identity normal facing the viewer; tune the "Normal background threshold" in the gradio app to get a feeling).
+Prompt: "Cute toy"
+![p](github_page/p17.png)
+Prompt: "Plaster statue of Abraham Lincoln"
+![p](github_page/p18.png)
+Compared to the depth model, this model seems to be a bit better at preserving the geometry. This is intuitive: minor details are not salient in depth maps, but are salient in normal maps. Below is the depth result with the same inputs. You can see that the hairstyle of the man in the input image is modified by the depth model, but preserved by the normal model.
+Prompt: "Plaster statue of Abraham Lincoln"
+![p](github_page/p19.png)
+## ControlNet with Anime Line Drawing
+We also trained a relatively simple ControlNet for anime line drawings. This tool may be useful for artistic creations. (Although the image details in the results are a bit modified, since it still diffuses latent images.)
+This model is not available right now. We need to evaluate the potential risks before releasing this model.
+![p](github_page/p21.png)
+# Annotate Your Own Data
+We provide simple python scripts to process images.
+[See a gradio example here](annotator.md).
+# Train with Your Own Data
+Training a ControlNet is as easy as (or even easier than) training a simple pix2pix.
+[See the steps here](train.md).
+# Citation
+    @misc{control2023,
+    author = "Lvmin Zhang and Maneesh Agrawala",
+    title = "Adding Conditional Control to Text-to-Image Diffusion Models",
+    month = "Feb",
+    year = "2023"
+    }
+[Download the paper here](https://github.com/lllyasviel/ControlNet/raw/main/github_page/control.pdf).

annotator.md ADDED Viewed

	@@ -0,0 +1,49 @@

+# Automatic Annotations
+We provide gradio examples to obtain annotations that are aligned to our pretrained production-ready models.
+Just run
+    python gradio_annotator.py
+Since everyone has different habits for organizing their datasets, we do not hard code any scripts for batch processing. But "gradio_annotator.py" is written in a super readable way, and modifying it to annotate your images should be easy.
+In the gradio UI of "gradio_annotator.py" we have the following interfaces:
+### Canny Edge
+Be careful about "black edge and white background" or "white edge and black background".
+![p](github_page/a1.png)
+### HED Edge
+Be careful about "black edge and white background" or "white edge and black background".
+![p](github_page/a2.png)
+### MLSD Edge
+Be careful about "black edge and white background" or "white edge and black background".
+![p](github_page/a3.png)
+### MIDAS Depth and Normal
+Be careful about RGB or BGR in normal maps.
+![p](github_page/a4.png)
+### Openpose
+Be careful about RGB or BGR in pose maps.
+For our production-ready model, the hand pose option is turned off.
+![p](github_page/a5.png)
+### Uniformer Segmentation
+Be careful about RGB or BGR in segmentation maps.
+![p](github_page/a6.png)

cog.yaml ADDED Viewed

	@@ -0,0 +1,51 @@

+# Configuration for Cog ⚙️
+# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
+build:
+  # set to true if your model requires a GPU
+  gpu: true
+  # a list of ubuntu apt packages to install
+  system_packages:
+    - "python3-opencv"
+    # - "libgl1-mesa-glx"
+    # - "libglib2.0-0"
+  # python version in the form '3.8' or '3.8.12'
+  python_version: "3.8"
+  # a list of packages in the format <package-name>==<version>
+  # packages required: torch torchvision numpy gradio albumentations opencv-contrib-python imageio imageio-ffmpeg pytorch-lightning omegaconf test-tube streamlit einops transformers webdataset kornia open_clip_torch invisible-watermark streamlit-drawable-canvas torchmetrics timm addict yapf prettytable
+  python_packages:
+    - "torch==1.13.0"
+    - "torchvision==0.14.0"
+    - "numpy==1.21.6"
+    - "gradio==3.18.0"
+    - "albumentations==1.2.1"
+    - "opencv-contrib-python==4.6.0.66"
+    - "imageio==2.9.0"
+    - "imageio-ffmpeg==0.4.8"
+    - "pytorch-lightning==1.9.1"
+    - "omegaconf==2.3.0"
+    - "test-tube==0.7.5"
+    - "streamlit==1.18.1"
+    - "einops==0.6.0"
+    - "transformers==4.26.1"
+    - "webdataset==0.2.33"
+    - "kornia==0.6.9"
+    - "open_clip_torch==2.11.1"
+    - "invisible-watermark==0.1.5"
+    - "streamlit-drawable-canvas==0.9.2"
+    - "torchmetrics==0.11.1"
+    - "timm==0.6.12"
+    - "addict==2.4.0"
+    - "yapf==0.32.0"
+    - "prettytable==3.6.0"
+  # commands run after the environment is setup
+  # run:
+    # - "echo env is ready!"
+    # - "echo another command if needed"
+# predict.py defines how predictions are run on your model
+predict: "predict.py:Predictor"

download_weights.py ADDED Viewed

	@@ -0,0 +1,17 @@

+import argparse
+# add command line arg for model type
+parser = argparse.ArgumentParser()
+parser.add_argument("--model_type", type=str, default="canny", help="Model type to download")
+# add a binary flag to wipe the weights folder
+parser.add_argument("--wipe", action="store_true", help="Wipe the weights folder")
+args = parser.parse_args()
+MODEL_TYPE = args.model_type
+from utils import model_dl_urls, annotator_dl_urls, download_model
+for model_name in annotator_dl_urls.keys():
+    download_model(model_name, annotator_dl_urls)
+download_model(MODEL_TYPE, model_dl_urls)

environment.yaml ADDED Viewed

	@@ -0,0 +1,33 @@

+name: control
+channels:
+  - pytorch
+  - defaults
+dependencies:
+  - python=3.8.5
+  - pip=20.3
+  - cudatoolkit=11.3
+  - pytorch=1.12.1
+  - torchvision=0.13.1
+  - numpy=1.23.1
+  - pip:
+      - gradio==3.16.2
+      - albumentations==1.3.0
+      - opencv-contrib-python==4.3.0.36
+      - imageio==2.9.0
+      - imageio-ffmpeg==0.4.2
+      - pytorch-lightning==1.5.0
+      - omegaconf==2.1.1
+      - test-tube>=0.7.5
+      - streamlit==1.12.1
+      - einops==0.3.0
+      - transformers==4.19.2
+      - webdataset==0.2.5
+      - kornia==0.6
+      - open_clip_torch==2.0.2
+      - invisible-watermark>=0.1.5
+      - streamlit-drawable-canvas==0.8.0
+      - torchmetrics==0.6.0
+      - timm==0.6.12
+      - addict==2.4.0
+      - yapf==0.32.0
+      - prettytable==3.6.0

gradio_annotator.py ADDED Viewed

	@@ -0,0 +1,160 @@

+import gradio as gr
+from annotator.util import resize_image, HWC3
+model_canny = None
+def canny(img, res, l, h):
+    img = resize_image(HWC3(img), res)
+    global model_canny
+    if model_canny is None:
+        from annotator.canny import apply_canny
+        model_canny = apply_canny
+    result = model_canny(img, l, h)
+    return [result]
+model_hed = None
+def hed(img, res):
+    img = resize_image(HWC3(img), res)
+    global model_hed
+    if model_hed is None:
+        from annotator.hed import apply_hed
+        model_hed = apply_hed
+    result = model_hed(img)
+    return [result]
+model_mlsd = None
+def mlsd(img, res, thr_v, thr_d):
+    img = resize_image(HWC3(img), res)
+    global model_mlsd
+    if model_mlsd is None:
+        from annotator.mlsd import apply_mlsd
+        model_mlsd = apply_mlsd
+    result = model_mlsd(img, thr_v, thr_d)
+    return [result]
+model_midas = None
+def midas(img, res, a):
+    img = resize_image(HWC3(img), res)
+    global model_midas
+    if model_midas is None:
+        from annotator.midas import apply_midas
+        model_midas = apply_midas
+    results = model_midas(img, a)
+    return results
+model_openpose = None
+def openpose(img, res, has_hand):
+    img = resize_image(HWC3(img), res)
+    global model_openpose
+    if model_openpose is None:
+        from annotator.openpose import apply_openpose
+        model_openpose = apply_openpose
+    result, _ = model_openpose(img, has_hand)
+    return [result]
+model_uniformer = None
+def uniformer(img, res):
+    img = resize_image(HWC3(img), res)
+    global model_uniformer
+    if model_uniformer is None:
+        from annotator.uniformer import apply_uniformer
+        model_uniformer = apply_uniformer
+    result = model_uniformer(img)
+    return [result]
+# Gradio page with one section per annotator.  Each section builds its own
+# inputs, a Run button and an output gallery, then wires the button to the
+# matching annotator function.  The names input_image / resolution /
+# run_button / gallery are rebound in every section, so each click handler
+# must be registered before the next section is built.
+block = gr.Blocks().queue()
+with block:
+    # --- Canny: inputs are passed as (image, resolution, low, high) ---
+    with gr.Row():
+        gr.Markdown("## Canny Edge")
+    with gr.Row():
+        with gr.Column():
+            input_image = gr.Image(source='upload', type="numpy")
+            low_threshold = gr.Slider(label="low_threshold", minimum=1, maximum=255, value=100, step=1)
+            high_threshold = gr.Slider(label="high_threshold", minimum=1, maximum=255, value=200, step=1)
+            resolution = gr.Slider(label="resolution", minimum=256, maximum=1024, value=512, step=64)
+            run_button = gr.Button(label="Run")
+        with gr.Column():
+            gallery = gr.Gallery(label="Generated images", show_label=False).style(height="auto")
+    run_button.click(fn=canny, inputs=[input_image, resolution, low_threshold, high_threshold], outputs=[gallery])
+    # --- HED: inputs are passed as (image, resolution) ---
+    with gr.Row():
+        gr.Markdown("## HED Edge")
+    with gr.Row():
+        with gr.Column():
+            input_image = gr.Image(source='upload', type="numpy")
+            resolution = gr.Slider(label="resolution", minimum=256, maximum=1024, value=512, step=64)
+            run_button = gr.Button(label="Run")
+        with gr.Column():
+            gallery = gr.Gallery(label="Generated images", show_label=False).style(height="auto")
+    run_button.click(fn=hed, inputs=[input_image, resolution], outputs=[gallery])
+    # --- MLSD: inputs are passed as (image, resolution, value_thr, distance_thr) ---
+    with gr.Row():
+        gr.Markdown("## MLSD Edge")
+    with gr.Row():
+        with gr.Column():
+            input_image = gr.Image(source='upload', type="numpy")
+            value_threshold = gr.Slider(label="value_threshold", minimum=0.01, maximum=2.0, value=0.1, step=0.01)
+            distance_threshold = gr.Slider(label="distance_threshold", minimum=0.01, maximum=20.0, value=0.1, step=0.01)
+            resolution = gr.Slider(label="resolution", minimum=256, maximum=1024, value=384, step=64)
+            run_button = gr.Button(label="Run")
+        with gr.Column():
+            gallery = gr.Gallery(label="Generated images", show_label=False).style(height="auto")
+    run_button.click(fn=mlsd, inputs=[input_image, resolution, value_threshold, distance_threshold], outputs=[gallery])
+    # --- MIDAS: inputs are passed as (image, resolution, alpha) ---
+    with gr.Row():
+        gr.Markdown("## MIDAS Depth and Normal")
+    with gr.Row():
+        with gr.Column():
+            input_image = gr.Image(source='upload', type="numpy")
+            alpha = gr.Slider(label="alpha", minimum=0.1, maximum=20.0, value=6.2, step=0.01)
+            resolution = gr.Slider(label="resolution", minimum=256, maximum=1024, value=384, step=64)
+            run_button = gr.Button(label="Run")
+        with gr.Column():
+            gallery = gr.Gallery(label="Generated images", show_label=False).style(height="auto")
+    run_button.click(fn=midas, inputs=[input_image, resolution, alpha], outputs=[gallery])
+    # --- Openpose: inputs are passed as (image, resolution, has_hand) ---
+    with gr.Row():
+        gr.Markdown("## Openpose")
+    with gr.Row():
+        with gr.Column():
+            input_image = gr.Image(source='upload', type="numpy")
+            hand = gr.Checkbox(label='detect hand', value=False)
+            resolution = gr.Slider(label="resolution", minimum=256, maximum=1024, value=512, step=64)
+            run_button = gr.Button(label="Run")
+        with gr.Column():
+            gallery = gr.Gallery(label="Generated images", show_label=False).style(height="auto")
+    run_button.click(fn=openpose, inputs=[input_image, resolution, hand], outputs=[gallery])
+    # --- Uniformer: inputs are passed as (image, resolution) ---
+    with gr.Row():
+        gr.Markdown("## Uniformer Segmentation")
+    with gr.Row():
+        with gr.Column():
+            input_image = gr.Image(source='upload', type="numpy")
+            resolution = gr.Slider(label="resolution", minimum=256, maximum=1024, value=512, step=64)
+            run_button = gr.Button(label="Run")
+        with gr.Column():
+            gallery = gr.Gallery(label="Generated images", show_label=False).style(height="auto")
+    run_button.click(fn=uniformer, inputs=[input_image, resolution], outputs=[gallery])
+# NOTE(review): server_name='0.0.0.0' asks Gradio to listen on every network
+# interface, not just localhost -- confirm that exposure is intended.
+block.launch(server_name='0.0.0.0')

gradio_canny2image.py ADDED Viewed

	@@ -0,0 +1,42 @@

+import cv2
+import einops
+import gradio as gr
+import numpy as np
+import torch
+from cldm.hack import disable_verbosity
+disable_verbosity()
+from pytorch_lightning import seed_everything
+from annotator.util import resize_image, HWC3
+from annotator.canny import apply_canny
+from cldm.model import create_model, load_state_dict
+from ldm.models.diffusion.ddim import DDIMSampler
+def process_canny(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, ddim_steps, scale, seed, eta, low_threshold, high_threshold, model, ddim_sampler):
+    with torch.no_grad():
+        img = resize_image(HWC3(input_image), image_resolution)
+        H, W, C = img.shape
+        detected_map = apply_canny(img, low_threshold, high_threshold)
+        detected_map = HWC3(detected_map)
+        control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0
+        control = torch.stack([control for _ in range(num_samples)], dim=0)
+        control = einops.rearrange(control, 'b h w c -> b c h w').clone()
+        seed_everything(seed)
+        cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]}
+        un_cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]}
+        shape = (4, H // 8, W // 8)
+        samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples,
+                                                     shape, cond, verbose=False, eta=eta,
+                                                     unconditional_guidance_scale=scale,
+                                                     unconditional_conditioning=un_cond)
+        x_samples = model.decode_first_stage(samples)
+        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
+        results = [x_samples[i] for i in range(num_samples)]
+    return [255 - detected_map] + results

gradio_depth2image.py ADDED Viewed

	@@ -0,0 +1,44 @@

+import cv2
+import einops
+import gradio as gr
+import numpy as np
+import torch
+from cldm.hack import disable_verbosity
+disable_verbosity()
+from pytorch_lightning import seed_everything
+from annotator.util import resize_image, HWC3
+from annotator.midas import apply_midas
+from cldm.model import create_model, load_state_dict
+from ldm.models.diffusion.ddim import DDIMSampler
+def process_depth(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, scale, seed, eta, model, ddim_sampler):
+    with torch.no_grad():
+        input_image = HWC3(input_image)
+        detected_map, _ = apply_midas(resize_image(input_image, detect_resolution))
+        detected_map = HWC3(detected_map)
+        img = resize_image(input_image, image_resolution)
+        H, W, C = img.shape
+        detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
+        control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0
+        control = torch.stack([control for _ in range(num_samples)], dim=0)
+        control = einops.rearrange(control, 'b h w c -> b c h w').clone()
+        seed_everything(seed)
+        cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]}
+        un_cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]}
+        shape = (4, H // 8, W // 8)
+        samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples,
+                                                     shape, cond, verbose=False, eta=eta,
+                                                     unconditional_guidance_scale=scale,
+                                                     unconditional_conditioning=un_cond)
+        x_samples = model.decode_first_stage(samples)
+        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
+        results = [x_samples[i] for i in range(num_samples)]
+    return [detected_map] + results

gradio_fake_scribble2image.py ADDED Viewed

	@@ -0,0 +1,48 @@

+import cv2
+import einops
+import gradio as gr
+import numpy as np
+import torch
+from cldm.hack import disable_verbosity
+disable_verbosity()
+from pytorch_lightning import seed_everything
+from annotator.util import resize_image, HWC3
+from annotator.hed import apply_hed, nms
+from cldm.model import create_model, load_state_dict
+from ldm.models.diffusion.ddim import DDIMSampler
+def process_scribble(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, scale, seed, eta, model, ddim_sampler):
+    with torch.no_grad():
+        input_image = HWC3(input_image)
+        detected_map = apply_hed(resize_image(input_image, detect_resolution))
+        detected_map = HWC3(detected_map)
+        img = resize_image(input_image, image_resolution)
+        H, W, C = img.shape
+        detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
+        detected_map = nms(detected_map, 127, 3.0)
+        detected_map = cv2.GaussianBlur(detected_map, (0, 0), 3.0)
+        detected_map[detected_map > 4] = 255
+        detected_map[detected_map < 255] = 0
+        control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0
+        control = torch.stack([control for _ in range(num_samples)], dim=0)
+        control = einops.rearrange(control, 'b h w c -> b c h w').clone()
+        seed_everything(seed)
+        cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]}
+        un_cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]}
+        shape = (4, H // 8, W // 8)
+        samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples,
+                                                     shape, cond, verbose=False, eta=eta,
+                                                     unconditional_guidance_scale=scale,
+                                                     unconditional_conditioning=un_cond)
+        x_samples = model.decode_first_stage(samples)
+        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
+        results = [x_samples[i] for i in range(num_samples)]
+    return [255 - detected_map] + results

gradio_hed2image.py ADDED Viewed

	@@ -0,0 +1,44 @@

+import cv2
+import einops
+import gradio as gr
+import numpy as np
+import torch
+from cldm.hack import disable_verbosity
+disable_verbosity()
+from pytorch_lightning import seed_everything
+from annotator.util import resize_image, HWC3
+from annotator.hed import apply_hed
+from cldm.model import create_model, load_state_dict
+from ldm.models.diffusion.ddim import DDIMSampler
+def process_hed(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, scale, seed, eta, model, ddim_sampler):
+    with torch.no_grad():
+        input_image = HWC3(input_image)
+        detected_map = apply_hed(resize_image(input_image, detect_resolution))
+        detected_map = HWC3(detected_map)
+        img = resize_image(input_image, image_resolution)
+        H, W, C = img.shape
+        detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
+        control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0
+        control = torch.stack([control for _ in range(num_samples)], dim=0)
+        control = einops.rearrange(control, 'b h w c -> b c h w').clone()
+        seed_everything(seed)
+        cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]}
+        un_cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]}
+        shape = (4, H // 8, W // 8)
+        samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples,
+                                                     shape, cond, verbose=False, eta=eta,
+                                                     unconditional_guidance_scale=scale,
+                                                     unconditional_conditioning=un_cond)
+        x_samples = model.decode_first_stage(samples)
+        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
+        results = [x_samples[i] for i in range(num_samples)]
+    return [detected_map] + results

gradio_hough2image.py ADDED Viewed

	@@ -0,0 +1,44 @@

+import cv2
+import einops
+import gradio as gr
+import numpy as np
+import torch
+from cldm.hack import disable_verbosity
+disable_verbosity()
+from pytorch_lightning import seed_everything
+from annotator.util import resize_image, HWC3
+from annotator.mlsd import apply_mlsd
+from cldm.model import create_model, load_state_dict
+from ldm.models.diffusion.ddim import DDIMSampler
+def process_mlsd(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, scale, seed, eta, value_threshold, distance_threshold, model, ddim_sampler):
+    with torch.no_grad():
+        input_image = HWC3(input_image)
+        detected_map = apply_mlsd(resize_image(input_image, detect_resolution), value_threshold, distance_threshold)
+        detected_map = HWC3(detected_map)
+        img = resize_image(input_image, image_resolution)
+        H, W, C = img.shape
+        detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST)
+        control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0
+        control = torch.stack([control for _ in range(num_samples)], dim=0)
+        control = einops.rearrange(control, 'b h w c -> b c h w').clone()
+        seed_everything(seed)
+        cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]}
+        un_cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]}
+        shape = (4, H // 8, W // 8)
+        samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples,
+                                                     shape, cond, verbose=False, eta=eta,
+                                                     unconditional_guidance_scale=scale,
+                                                     unconditional_conditioning=un_cond)
+        x_samples = model.decode_first_stage(samples)
+        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
+        results = [x_samples[i] for i in range(num_samples)]
+    return [255 - cv2.dilate(detected_map, np.ones(shape=(3, 3), dtype=np.uint8), iterations=1)] + results

gradio_normal2image.py ADDED Viewed

	@@ -0,0 +1,44 @@

+import cv2
+import einops
+import gradio as gr
+import numpy as np
+import torch
+from cldm.hack import disable_verbosity
+disable_verbosity()
+from pytorch_lightning import seed_everything
+from annotator.util import resize_image, HWC3
+from annotator.midas import apply_midas
+from cldm.model import create_model, load_state_dict
+from ldm.models.diffusion.ddim import DDIMSampler
+def process_normal(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, scale, seed, eta, bg_threshold, model, ddim_sampler):
+    with torch.no_grad():
+        input_image = HWC3(input_image)
+        _, detected_map = apply_midas(resize_image(input_image, detect_resolution), bg_th=bg_threshold)
+        detected_map = HWC3(detected_map)
+        img = resize_image(input_image, image_resolution)
+        H, W, C = img.shape
+        detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
+        control = torch.from_numpy(detected_map[:, :, ::-1].copy()).float().cuda() / 255.0
+        control = torch.stack([control for _ in range(num_samples)], dim=0)
+        control = einops.rearrange(control, 'b h w c -> b c h w').clone()
+        seed_everything(seed)
+        cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]}
+        un_cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]}
+        shape = (4, H // 8, W // 8)
+        samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples,
+                                                     shape, cond, verbose=False, eta=eta,
+                                                     unconditional_guidance_scale=scale,
+                                                     unconditional_conditioning=un_cond)
+        x_samples = model.decode_first_stage(samples)
+        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
+        results = [x_samples[i] for i in range(num_samples)]
+    return [detected_map] + results

gradio_pose2image.py ADDED Viewed

	@@ -0,0 +1,44 @@

+import cv2
+import einops
+import gradio as gr
+import numpy as np
+import torch
+from cldm.hack import disable_verbosity
+disable_verbosity()
+from pytorch_lightning import seed_everything
+from annotator.util import resize_image, HWC3
+from annotator.openpose import apply_openpose
+from cldm.model import create_model, load_state_dict
+from ldm.models.diffusion.ddim import DDIMSampler
+def process_pose(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, scale, seed, eta, model, ddim_sampler):
+    with torch.no_grad():
+        input_image = HWC3(input_image)
+        detected_map, _ = apply_openpose(resize_image(input_image, detect_resolution))
+        detected_map = HWC3(detected_map)
+        img = resize_image(input_image, image_resolution)
+        H, W, C = img.shape
+        detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST)
+        control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0
+        control = torch.stack([control for _ in range(num_samples)], dim=0)
+        control = einops.rearrange(control, 'b h w c -> b c h w').clone()
+        seed_everything(seed)
+        cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]}
+        un_cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]}
+        shape = (4, H // 8, W // 8)
+        samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples,
+                                                     shape, cond, verbose=False, eta=eta,
+                                                     unconditional_guidance_scale=scale,
+                                                     unconditional_conditioning=un_cond)
+        x_samples = model.decode_first_stage(samples)
+        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
+        results = [x_samples[i] for i in range(num_samples)]
+    return [detected_map] + results

gradio_scribble2image.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import cv2
+import einops
+import gradio as gr
+import numpy as np
+import torch
+from cldm.hack import disable_verbosity
+disable_verbosity()
+from pytorch_lightning import seed_everything
+from annotator.util import resize_image, HWC3
+from cldm.model import create_model, load_state_dict
+from ldm.models.diffusion.ddim import DDIMSampler
+def process_scribble(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, ddim_steps, scale, seed, eta, model, ddim_sampler):
+    with torch.no_grad():
+        img = resize_image(HWC3(input_image), image_resolution)
+        H, W, C = img.shape
+        detected_map = np.zeros_like(img, dtype=np.uint8)
+        detected_map[np.min(img, axis=2) < 127] = 255
+        control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0
+        control = torch.stack([control for _ in range(num_samples)], dim=0)
+        control = einops.rearrange(control, 'b h w c -> b c h w').clone()
+        seed_everything(seed)
+        cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]}
+        un_cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]}
+        shape = (4, H // 8, W // 8)
+        samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples,
+                                                     shape, cond, verbose=False, eta=eta,
+                                                     unconditional_guidance_scale=scale,
+                                                     unconditional_conditioning=un_cond)
+        x_samples = model.decode_first_stage(samples)
+        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
+        results = [x_samples[i] for i in range(num_samples)]
+    return [255 - detected_map] + results

gradio_scribble2image_interactive.py ADDED Viewed

	@@ -0,0 +1,45 @@

+import cv2
+import einops
+import gradio as gr
+import numpy as np
+import torch
+from cldm.hack import disable_verbosity
+disable_verbosity()
+from pytorch_lightning import seed_everything
+from annotator.util import resize_image, HWC3
+from cldm.model import create_model, load_state_dict
+from ldm.models.diffusion.ddim import DDIMSampler
+def process(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, ddim_steps, scale, seed, eta):
+    with torch.no_grad():
+        img = resize_image(HWC3(input_image['mask'][:, :, 0]), image_resolution)
+        H, W, C = img.shape
+        detected_map = np.zeros_like(img, dtype=np.uint8)
+        detected_map[np.min(img, axis=2) > 127] = 255
+        control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0
+        control = torch.stack([control for _ in range(num_samples)], dim=0)
+        control = einops.rearrange(control, 'b h w c -> b c h w').clone()
+        seed_everything(seed)
+        cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]}
+        un_cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]}
+        shape = (4, H // 8, W // 8)
+        samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples,
+                                                     shape, cond, verbose=False, eta=eta,
+                                                     unconditional_guidance_scale=scale,
+                                                     unconditional_conditioning=un_cond)
+        x_samples = model.decode_first_stage(samples)
+        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
+        results = [x_samples[i] for i in range(num_samples)]
+    return [255 - detected_map] + results
+def create_canvas(w, h):
+    return np.zeros(shape=(h, w, 3), dtype=np.uint8) + 255

gradio_seg2image.py ADDED Viewed

	@@ -0,0 +1,43 @@

+import cv2
+import einops
+import gradio as gr
+import numpy as np
+import torch
+from cldm.hack import disable_verbosity
+disable_verbosity()
+from pytorch_lightning import seed_everything
+from annotator.util import resize_image, HWC3
+from annotator.uniformer import apply_uniformer
+from cldm.model import create_model, load_state_dict
+from ldm.models.diffusion.ddim import DDIMSampler
+def process_seg(input_image, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, scale, seed, eta, model, ddim_sampler):
+    with torch.no_grad():
+        input_image = HWC3(input_image)
+        detected_map = apply_uniformer(resize_image(input_image, detect_resolution))
+        img = resize_image(input_image, image_resolution)
+        H, W, C = img.shape
+        detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_NEAREST)
+        control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0
+        control = torch.stack([control for _ in range(num_samples)], dim=0)
+        control = einops.rearrange(control, 'b h w c -> b c h w').clone()
+        seed_everything(seed)
+        cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]}
+        un_cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]}
+        shape = (4, H // 8, W // 8)
+        samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples,
+                                                     shape, cond, verbose=False, eta=eta,
+                                                     unconditional_guidance_scale=scale,
+                                                     unconditional_conditioning=un_cond)
+        x_samples = model.decode_first_stage(samples)
+        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
+        results = [x_samples[i] for i in range(num_samples)]
+    return [detected_map] + results

output.0.png ADDED Viewed

output.1.png ADDED Viewed

predict.py ADDED Viewed

	@@ -0,0 +1,216 @@

+# Prediction interface for Cog ⚙️
+# https://github.com/replicate/cog/resolve/main/docs/python.md
+from cog import BasePredictor, Input, Path
+import os
+from subprocess import call
+from cldm.model import create_model, load_state_dict
+from ldm.models.diffusion.ddim import DDIMSampler
+from PIL import Image
+import numpy as np
+from typing import List
+from utils import get_state_dict_path, download_model, model_dl_urls, annotator_dl_urls
+# Pipeline served by this build.  It selects the import below, the checkpoint
+# path requested via get_state_dict_path() in Predictor.setup(), and the branch
+# taken in Predictor.predict().
+MODEL_TYPE = "openpose"
+# Only the entry point for the selected type is imported, so the other
+# process_* names are undefined at runtime.
+if MODEL_TYPE == "canny":
+    from gradio_canny2image import process_canny
+elif MODEL_TYPE == "depth":
+    from gradio_depth2image import process_depth
+elif MODEL_TYPE == "hed":
+    from gradio_hed2image import process_hed
+elif MODEL_TYPE == "normal":
+    from gradio_normal2image import process_normal
+elif MODEL_TYPE == "mlsd":
+    from gradio_hough2image import process_mlsd
+elif MODEL_TYPE == "scribble":
+    from gradio_scribble2image import process_scribble
+elif MODEL_TYPE == "seg":
+    from gradio_seg2image import process_seg
+elif MODEL_TYPE == "openpose":
+    from gradio_pose2image import process_pose
+class Predictor(BasePredictor):
+    def setup(self):
+        """Load the model into memory to make running multiple predictions efficient"""
+        self.model = create_model('./models/cldm_v15.yaml').cuda()
+        self.model.load_state_dict(load_state_dict(get_state_dict_path(MODEL_TYPE), location='cuda'))
+        self.ddim_sampler = DDIMSampler(self.model)
+    def predict(
+        self,
+        image: Path = Input(description="Input image"),
+        prompt: str = Input(description="Prompt for the model"),
+        num_samples: str = Input(
+            description="Number of samples (higher values may OOM)",
+            choices=['1', '4'],
+            default='1'
+        ),
+        image_resolution: str = Input(
+            description="Image resolution to be generated",
+            choices = ['256', '512', '768'],
+            default='512'
+        ),
+        low_threshold: int = Input(description="Canny line detection low threshold", default=100, ge=1, le=255), # only applicable when model type is 'canny'
+        high_threshold: int = Input(description="Canny line detection high threshold", default=200, ge=1, le=255), # only applicable when model type is 'canny'
+        ddim_steps: int = Input(description="Steps", default=20),
+        scale: float = Input(description="Scale for classifier-free guidance", default=9.0, ge=0.1, le=30.0),
+        seed: int = Input(description="Seed", default=None),
+        eta: float = Input(description="Controls the amount of noise that is added to the input data during the denoising diffusion process. Higher value -> more noise", default=0.0),
+        a_prompt: str = Input(description="Additional text to be appended to prompt", default="best quality, extremely detailed"),
+        n_prompt: str = Input(description="Negative Prompt", default="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality"),
+        detect_resolution: int = Input(description="Resolution at which detection method will be applied)", default=512, ge=128, le=1024), # only applicable when model type is 'HED', 'seg', or 'MLSD'
+        # bg_threshold: float = Input(description="Background Threshold (only applicable when model type is 'normal')", default=0.0, ge=0.0, le=1.0), # only applicable when model type is 'normal'
+        # value_threshold: float = Input(description="Value Threshold (only applicable when model type is 'MLSD')", default=0.1, ge=0.01, le=2.0), # only applicable when model type is 'MLSD'
+        # distance_threshold: float = Input(description="Distance Threshold (only applicable when model type is 'MLSD')", default=0.1, ge=0.01, le=20.0), # only applicable when model type is 'MLSD'
+    ) -> List[Path]:
+        """Run a single prediction on the model"""
+        num_samples = int(num_samples)
+        image_resolution = int(image_resolution)
+        if not seed:
+            seed = np.random.randint(1000000)
+        else:
+            seed = int(seed)
+        # load input_image
+        input_image = Image.open(image)
+        # convert to numpy
+        input_image = np.array(input_image)
+        if MODEL_TYPE == "canny":
+            outputs = process_canny(
+                input_image,
+                prompt,
+                a_prompt,
+                n_prompt,
+                num_samples,
+                image_resolution,
+                ddim_steps,
+                scale,
+                seed,
+                eta,
+                low_threshold,
+                high_threshold,
+                self.model,
+                self.ddim_sampler,
+            )
+        elif MODEL_TYPE == "depth":
+            outputs = process_depth(
+                input_image,
+                prompt,
+                a_prompt,
+                n_prompt,
+                num_samples,
+                image_resolution,
+                detect_resolution,
+                ddim_steps,
+                scale,
+                seed,
+                eta,
+                self.model,
+                self.ddim_sampler,
+            )
+        elif MODEL_TYPE == "hed":
+            outputs = process_hed(
+                input_image,
+                prompt,
+                a_prompt,
+                n_prompt,
+                num_samples,
+                image_resolution,
+                detect_resolution,
+                ddim_steps,
+                scale,
+                seed,
+                eta,
+                self.model,
+                self.ddim_sampler,
+            )
+        elif MODEL_TYPE == "normal":
+            outputs = process_normal(
+                input_image,
+                prompt,
+                a_prompt,
+                n_prompt,
+                num_samples,
+                image_resolution,
+                ddim_steps,
+                scale,
+                seed,
+                eta,
+                bg_threshold,
+                self.model,
+                self.ddim_sampler,
+            )
+        elif MODEL_TYPE == "mlsd":
+            outputs = process_mlsd(
+                input_image,
+                prompt,
+                a_prompt,
+                n_prompt,
+                num_samples,
+                image_resolution,
+                detect_resolution,
+                ddim_steps,
+                scale,
+                seed,
+                eta,
+                value_threshold,
+                distance_threshold,
+                self.model,
+                self.ddim_sampler,
+            )
+        elif MODEL_TYPE == "scribble":
+            outputs = process_scribble(
+                input_image,
+                prompt,
+                a_prompt,
+                n_prompt,
+                num_samples,
+                image_resolution,
+                ddim_steps,
+                scale,
+                seed,
+                eta,
+                self.model,
+                self.ddim_sampler,
+            )
+        elif MODEL_TYPE == "seg":
+            outputs = process_seg(
+                input_image,
+                prompt,
+                a_prompt,
+                n_prompt,
+                num_samples,
+                image_resolution,
+                detect_resolution,
+                ddim_steps,
+                scale,
+                seed,
+                eta,
+                self.model,
+                self.ddim_sampler,
+            )
+        elif MODEL_TYPE == "openpose":
+            outputs = process_pose(
+                input_image,
+                prompt,
+                a_prompt,
+                n_prompt,
+                num_samples,
+                image_resolution,
+                detect_resolution,
+                ddim_steps,
+                scale,
+                seed,
+                eta,
+                self.model,
+                self.ddim_sampler,
+            )
+        # outputs from list to PIL
+        outputs = [Image.fromarray(output) for output in outputs]
+        # save outputs to file
+        outputs = [output.save(f"tmp/output_{i}.png") for i, output in enumerate(outputs)]
+        # return paths to output files
+        return [Path(f"tmp/output_{i}.png") for i in range(len(outputs))]

tool_add_control.py ADDED Viewed

	@@ -0,0 +1,49 @@

+import sys
+import os
+assert len(sys.argv) == 3, 'Args are wrong.'
+input_path = sys.argv[1]
+output_path = sys.argv[2]
+assert os.path.exists(input_path), 'Input model does not exist.'
+assert not os.path.exists(output_path), 'Output filename already exists.'
+assert os.path.exists(os.path.dirname(output_path)), 'Output path is not valid.'
+import torch
+from cldm.model import create_model
+def get_node_name(name, parent_name):
+    if len(name) <= len(parent_name):
+        return False, ''
+    p = name[:len(parent_name)]
+    if p != parent_name:
+        return False, ''
+    return True, name[len(parent_name):]
+model = create_model(config_path='./models/cldm_v15.yaml')
+pretrained_weights = torch.load(input_path)
+if 'state_dict' in pretrained_weights:
+    pretrained_weights = pretrained_weights['state_dict']
+scratch_dict = model.state_dict()
+target_dict = {}
+for k in scratch_dict.keys():
+    is_control, name = get_node_name(k, 'control_')
+    if is_control:
+        copy_k = 'model.diffusion_' + name
+    else:
+        copy_k = k
+    if copy_k in pretrained_weights:
+        target_dict[k] = pretrained_weights[copy_k].clone()
+    else:
+        target_dict[k] = scratch_dict[k].clone()
+        print(f'These weights are newly added: {k}')
+model.load_state_dict(target_dict, strict=True)
+torch.save(model.state_dict(), output_path)
+print('Done.')

train.md ADDED Viewed

	@@ -0,0 +1,251 @@

+# Train a ControlNet to Control SD
+You are here because you want to control SD in your own way, maybe you have an idea for your perfect research project, and you will annotate some data or have already annotated your own dataset automatically or manually. Herein, the control can be anything that can be converted to images, such as edges, keypoints, segments, etc.
+Before moving on to your own dataset, we highly recommend first trying the toy dataset, Fill50K, as a sanity check. This will help you get a "feeling" for the training: you will learn how long it takes for the model to converge, whether your device can complete the training in an acceptable amount of time, and what it "feels" like when the model converges.
+We hope that after you read this page, you will find that training a ControlNet is as easy as (or easier than) training a pix2pix.
+## Step 0 - Design your control
+Let us take a look at a very simple task to control SD to fill color in circles.
+![p](github_page/t1.png)
+This is simple: we want to control SD to fill a circle with colors, and the prompt contains some description of our target.
+Stable diffusion is trained on billions of images, and it already knows what is "cyan", what is "circle", what is "pink", and what is "background".
+But it does not know the meaning of that "Control Image (Source Image)". Our target is to let it know.
+## Step 1 - Get a dataset
+Just download the Fill50K dataset from [our huggingface page](https://huggingface.co/lllyasviel/ControlNet) (training/fill50k.zip, the file is only 200M!). Make sure that the data is decompressed as
+    ControlNet/training/fill50k/prompt.json
+    ControlNet/training/fill50k/source/X.png
+    ControlNet/training/fill50k/target/X.png
+In the folder "fill50k/source", you will have 50k images of circle lines.
+![p](github_page/t2.png)
+In the folder "fill50k/target", you will have 50k images of filled circles.
+![p](github_page/t3.png)
+In the "fill50k/prompt.json", you will have their filenames and prompts. Each prompt is like "a balabala color circle in some other color background."
+![p](github_page/t4.png)
+## Step 2 - Load the dataset
+Then you need to write a simple script to read this dataset for pytorch. (In fact we have written it for you in "tutorial_dataset.py".)
+```python
+import json
+import cv2
+import numpy as np
+from torch.utils.data import Dataset
+class MyDataset(Dataset):
+    def __init__(self):
+        self.data = []
+        with open('./training/fill50k/prompt.json', 'rt') as f:
+            for line in f:
+                self.data.append(json.loads(line))
+    def __len__(self):
+        return len(self.data)
+    def __getitem__(self, idx):
+        item = self.data[idx]
+        source_filename = item['source']
+        target_filename = item['target']
+        prompt = item['prompt']
+        source = cv2.imread('./training/fill50k/' + source_filename)
+        target = cv2.imread('./training/fill50k/' + target_filename)
+        # Do not forget that OpenCV read images in BGR order.
+        source = cv2.cvtColor(source, cv2.COLOR_BGR2RGB)
+        target = cv2.cvtColor(target, cv2.COLOR_BGR2RGB)
+        # Normalize images to [-1, 1].
+        source = (source.astype(np.float32) / 127.5) - 1.0
+        target = (target.astype(np.float32) / 127.5) - 1.0
+        return dict(jpg=target, txt=prompt, hint=source)
+```
+This will make your dataset into an array-like object in python. You can test this dataset simply by accessing the array, like this
+```python
+from tutorial_dataset import MyDataset
+dataset = MyDataset()
+print(len(dataset))  # number of samples
+item = dataset[1234]  # one sample: a dict with the keys 'jpg', 'txt' and 'hint'
+jpg = item['jpg']  # target image
+txt = item['txt']  # prompt
+hint = item['hint']  # control image
+print(txt)
+print(jpg.shape)
+print(hint.shape)
+```
+The outputs of this simple test on my machine are
+    50000
+    burly wood circle with orange background
+    (512, 512, 3)
+    (512, 512, 3)
+And this code is in "tutorial_dataset_test.py".
+In this way, the dataset is an array-like object with 50000 items. Each item is a dict with three entries: "jpg", "txt", and "hint". The "jpg" is the target image, the "hint" is the control image, and the "txt" is the prompt.
+Do not ask us why we use these three names - this is related to the dark history of a library called LDM.
+## Step 3 - What SD model do you want to control?
+Then you need to decide which Stable Diffusion Model you want to control. In this example, we will just use standard SD1.5. You can download it from the [official page of Stability](https://huggingface.co/runwayml/stable-diffusion-v1-5/tree/main). You want the file "v1-5-pruned.ckpt".
+Then you need to attach a control net to the SD model. The architecture is
+![img](github_page/sd.png)
+Note that all weights inside the ControlNet are also copied from SD so that no layer is trained from scratch, and you are still finetuning the entire model.
+We provide a simple script for you to achieve this easily. If your SD filename is "./models/v1-5-pruned.ckpt" and you want the script to save the processed model (SD+ControlNet) at location "./models/control_sd15_ini.ckpt", you can just run:
+    python tool_add_control.py ./models/v1-5-pruned.ckpt ./models/control_sd15_ini.ckpt
+You may also use other filenames as long as the command is "python tool_add_control.py input_path output_path".
+This is the correct output from my machine:
+![img](github_page/t5.png)
+## Step 4 - Train!
+Happy! We finally come to the most exciting part: training!
+The training code in "tutorial_train.py" is actually surprisingly simple:
+```python
+import pytorch_lightning as pl
+from torch.utils.data import DataLoader
+from tutorial_dataset import MyDataset
+from cldm.logger import ImageLogger
+from cldm.model import create_model, load_state_dict
+# Configs (resume_path is the SD+ControlNet checkpoint to start from, e.g. the one written by tool_add_control.py)
+resume_path = './models/control_sd15_ini.ckpt'
+batch_size = 4
+logger_freq = 300  # forwarded to ImageLogger(batch_frequency=...) below
+learning_rate = 1e-5
+sd_locked = True  # copied onto model.sd_locked below
+only_mid_control = False  # copied onto model.only_mid_control below
+# First use cpu to load models. Pytorch Lightning will automatically move it to GPUs.
+model = create_model('./models/cldm_v15.yaml').cpu()
+model.load_state_dict(load_state_dict(resume_path, location='cpu'))
+model.learning_rate = learning_rate
+model.sd_locked = sd_locked
+model.only_mid_control = only_mid_control
+# Misc: dataset, data loader, image logger and trainer
+dataset = MyDataset()
+dataloader = DataLoader(dataset, num_workers=0, batch_size=batch_size, shuffle=True)  # num_workers=0: batches are loaded in the main process
+logger = ImageLogger(batch_frequency=logger_freq)
+trainer = pl.Trainer(gpus=1, precision=32, callbacks=[logger])  # one GPU, 32-bit precision, image logger as callback
+# Train!
+trainer.fit(model, dataloader)
+```
+Thanks to our organized dataset pytorch object and the power of pytorch_lightning, the entire code is just super short.
+Now, you may take a look at [Pytorch Lightning Official DOC](https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.trainer.trainer.Trainer.html?highlight=trainer) to find out how to enable many useful features like gradient accumulation, multiple GPU training, accelerated dataset loading, flexible checkpoint saving, etc. All these only need about one line of code. Great!
+Note that if you run out of GPU memory (OOM), you may need to use a smaller batch size together with gradient accumulation. You may also want to use some “advanced” tricks like sliced attention or xformers. For example:
+```python
+# Configs (only the values that change relative to the script above)
+batch_size = 1  # smaller batches to reduce GPU memory use
+# Misc
+trainer = pl.Trainer(gpus=1, precision=32, callbacks=[logger], accumulate_grad_batches=4)  # Gradients are accumulated over 4 batches per optimizer step. But this will be 4x slower
+```
+Note that training with 8 GB laptop GPU is challenging. We will need some GPU memory optimization at least as good as automatic1111’s UI. This may require expert modifications to the code.
+### Screenshots
+The training is fast. After 4000 steps (batch size 4, learning rate 1e-5, about 50 minutes on PCIE 40G), the results on my machine (in an output folder "image_log") are:
+Control:
+![img](github_page/t/ip.png)
+Prompt:
+![img](github_page/t/t.png)
+Prediction:
+![img](github_page/t/op.png)
+Ground Truth:
+![img](github_page/t/gt.png)
+Note that SD's capability is preserved. Even when trained on this highly aligned dataset, it still draws some random textures and those snow decorations. (Also note that the ground truth looks slightly modified because it is converted from SD's latent image.)
+Larger batch size and longer training will further improve this. Adequate training will make the filling perfect.
+Of course, training SD to fill circles is meaningless, but this is a successful beginning of your story.
+Let us work together to control large models more and more.
+## Other options
+Beyond standard things, we also provide two important parameters "sd_locked" and "only_mid_control" that you need to know.
+### only_mid_control
+By default, only_mid_control is False. When it is True, you will train the below architecture.
+![img](github_page/t6.png)
+This can be helpful when your computation power is limited and you want to speed up the training, or when you want to facilitate the "global" context learning. Note that you may toggle this option during training: pause, set it to True, resume, and later pause again, change it again, and resume again.
+If your computation device is good, perhaps you do not need this. But I also know some artists are willing to train a model on their laptop for a month - in that case, perhaps this option can be useful.
+### sd_locked
+By default, sd_locked is True. When it is False, you will train the below architecture.
+![img](github_page/t7.png)
+This will unlock some layers in SD and you will train them as a whole.
+This option is DANGEROUS! If your dataset is not good enough, this may downgrade the capability of your SD model.
+However, this option is also very useful when you are training on images with some specific style, or when you are training with special datasets (like medical dataset with X-ray images or geographic datasets with lots of Google Maps). You can understand this as simultaneously training the ControlNet and something like a DreamBooth.
+Also, if your dataset is large, you may want to end the training with a few thousand steps with those layers unlocked. This usually improves the "problem-specific" solutions a little. You may try it yourself to feel the difference.
+Also, if you unlock some original layers, you may want a lower learning rate, like 2e-6.

tutorial_dataset.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import json
+import cv2
+import numpy as np
+from torch.utils.data import Dataset
+class MyDataset(Dataset):
+    def __init__(self):
+        self.data = []
+        with open('./training/fill50k/prompt.json', 'rt') as f:
+            for line in f:
+                self.data.append(json.loads(line))
+    def __len__(self):
+        return len(self.data)
+    def __getitem__(self, idx):
+        item = self.data[idx]
+        source_filename = item['source']
+        target_filename = item['target']
+        prompt = item['prompt']
+        source = cv2.imread('./training/fill50k/' + source_filename)
+        target = cv2.imread('./training/fill50k/' + target_filename)
+        # Do not forget that OpenCV read images in BGR order.
+        source = cv2.cvtColor(source, cv2.COLOR_BGR2RGB)
+        target = cv2.cvtColor(target, cv2.COLOR_BGR2RGB)
+        # Normalize images to [-1, 1].
+        source = (source.astype(np.float32) / 127.5) - 1.0
+        target = (target.astype(np.float32) / 127.5) - 1.0
+        return dict(jpg=target, txt=prompt, hint=source)

tutorial_dataset_test.py ADDED Viewed

	@@ -0,0 +1,12 @@

+from tutorial_dataset import MyDataset
+dataset = MyDataset()
+print(len(dataset))
+item = dataset[1234]
+jpg = item['jpg']
+txt = item['txt']
+hint = item['hint']
+print(txt)
+print(jpg.shape)
+print(hint.shape)

tutorial_train.py ADDED Viewed

	@@ -0,0 +1,36 @@

+from cldm.hack import disable_verbosity
+disable_verbosity()
+import pytorch_lightning as pl
+from torch.utils.data import DataLoader
+from tutorial_dataset import MyDataset
+from cldm.logger import ImageLogger
+from cldm.model import create_model, load_state_dict
+# Configs (resume_path is the SD+ControlNet checkpoint to start from, e.g. the one written by tool_add_control.py)
+resume_path = './models/control_sd15_ini.ckpt'
+batch_size = 4
+logger_freq = 300  # forwarded to ImageLogger(batch_frequency=...) below
+learning_rate = 1e-5
+sd_locked = True  # copied onto model.sd_locked below
+only_mid_control = False  # copied onto model.only_mid_control below
+# First use cpu to load models. Pytorch Lightning will automatically move it to GPUs.
+model = create_model('./models/cldm_v15.yaml').cpu()
+model.load_state_dict(load_state_dict(resume_path, location='cpu'))
+model.learning_rate = learning_rate
+model.sd_locked = sd_locked
+model.only_mid_control = only_mid_control
+# Misc: dataset, data loader, image logger and trainer
+dataset = MyDataset()
+dataloader = DataLoader(dataset, num_workers=0, batch_size=batch_size, shuffle=True)  # num_workers=0: batches are loaded in the main process
+logger = ImageLogger(batch_frequency=logger_freq)
+trainer = pl.Trainer(gpus=1, precision=32, callbacks=[logger])  # one GPU, 32-bit precision, image logger as callback
+# Train!
+trainer.fit(model, dataloader)

utils.py ADDED Viewed

	@@ -0,0 +1,39 @@

+import os
+from subprocess import call
+model_dl_urls = {
+    "canny": "https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_canny.pth",
+    "depth": "https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_depth.pth",
+    "hed": "https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_hed.pth",
+    "normal": "https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_normal.pth",
+    "mlsd": "https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_mlsd.pth",
+    "openpose": "https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_openpose.pth",
+    "scribble": "https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_scribble.pth",
+    "seg": "https://huggingface.co/lllyasviel/ControlNet/resolve/main/models/control_sd15_seg.pth",
+}
+annotator_dl_urls = {
+    "body_pose_model.pth": "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/body_pose_model.pth",
+    "dpt_hybrid-midas-501f0c75.pt": "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt",
+    "hand_pose_model.pth": "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/hand_pose_model.pth",
+    "mlsd_large_512_fp32.pth": "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/mlsd_large_512_fp32.pth",
+    "mlsd_tiny_512_fp32.pth": "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/mlsd_tiny_512_fp32.pth",
+    "network-bsds500.pth": "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/network-bsds500.pth",
+    "upernet_global_small.pth": "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/upernet_global_small.pth",
+}
+def download_model(model_name, urls_map):
+    """
+    Download model from huggingface with wget and save to models directory
+    """
+    model_url = urls_map[model_name]
+    relative_path_to_model = model_url.replace("https://huggingface.co/lllyasviel/ControlNet/resolve/main/", "")
+    if not os.path.exists(relative_path_to_model):
+        print(f"Downloading {model_name}...")
+        call(["wget", "-O", relative_path_to_model, model_url])
+def get_state_dict_path(model_name):
+    """
+    Get path to model state dict
+    """
+    return f"./models/control_sd15_{model_name}.pth"