NeMo_Canary / scripts /flux /flux_infer.py

Upload folder using huggingface_hub

b386992 verified 9 months ago

5.36 kB

	# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import argparse

	import os
	import torch

	from nemo.collections.diffusion.models.flux.pipeline import FluxInferencePipeline
	from nemo.collections.diffusion.utils.flux_pipeline_utils import configs
	from nemo.collections.diffusion.utils.mcore_parallel_utils import Utils


	def parse_args():
	# pylint: disable=C0116
	parser = argparse.ArgumentParser(
	description="The flux inference pipeline is utilizing megatron core transformer.\n"
	"Please prepare the necessary checkpoints for flux model on local disk in order to use this script"
	)

	parser.add_argument("--flux_ckpt", type=str, default="", help="Path to Flux transformer checkpoint(s)")
	parser.add_argument("--vae_ckpt", type=str, default="/ckpts/ae.safetensors", help="Path to \'ae.safetensors\'")
	parser.add_argument(
	"--clip_version",
	type=str,
	default='/ckpts/text_encoder',
	help="Clip version, provide either ckpt dir or clip version like openai/clip-vit-large-patch14",
	)
	parser.add_argument(
	"--t5_version",
	type=str,
	default='/ckpts/text_encoder_2',
	help="Clip version, provide either ckpt dir or clip version like google/t5-v1_1-xxl",
	)
	parser.add_argument(
	"--t5_load_config_only",
	action='store_true',
	default=False,
	help="randomly initialize T5 weights for testing purpose",
	)
	parser.add_argument(
	"--do_convert_from_hf",
	action='store_true',
	default=False,
	help="Must be true if provided checkpoint is not already converted to NeMo version",
	)
	parser.add_argument(
	"--save_converted_model_to",
	type=str,
	default=None,
	help="Whether to save the converted NeMo transformer checkpoint for Flux",
	)
	parser.add_argument(
	"--version",
	type=str,
	default='dev',
	help="Must align with the checkpoint provided.",
	)
	parser.add_argument("--height", type=int, default=1024, help="Image height.")
	parser.add_argument("--width", type=int, default=1024, help="Image width.")
	parser.add_argument("--inference_steps", type=int, default=10, help="Number of inference steps to run.")
	parser.add_argument(
	"--num_images_per_prompt", type=int, default=1, help="Number of images to generate for each prompt."
	)
	parser.add_argument(
	"--num_joint_layers", type=int, default=19, help="Number of joint transformer layers in controlnet."
	)
	parser.add_argument(
	"--num_single_layers", type=int, default=38, help="Number of single transformer layers in controlnet."
	)
	parser.add_argument("--guidance", type=float, default=0.0, help="Guidance scale.")
	parser.add_argument(
	"--offload", action='store_true', default=False, help="Offload modules to cpu after being called."
	)
	parser.add_argument(
	"--prompts",
	type=str,
	default="A cat holding a sign that says hello world",
	help="Inference prompts, use \',\' to separate if multiple prompts are provided.",
	)
	parser.add_argument("--output_path", type=str, default="/tmp/flux_output", help="Path to save inference output.")
	args = parser.parse_args()
	return args


	if __name__ == '__main__':
	args = parse_args()
	print('Initializing model parallel config')
	Utils.initialize_distributed(1, 1, 1)

	print('Initializing flux inference pipeline')
	params = configs[args.version]
	params.vae_config.ckpt = args.vae_ckpt if os.path.exists(args.vae_ckpt) else None
	params.clip_params.version = (
	args.clip_version if os.path.exists(args.clip_version) else "openai/clip-vit-large-patch14"
	)
	params.t5_params.version = args.t5_version if os.path.exists(args.t5_version) else "google/t5-v1_1-xxl"
	params.t5_params.load_config_only = args.t5_load_config_only

	params.flux_config.num_joint_layers = args.num_joint_layers
	params.flux_config.num_single_layers = args.num_single_layers
	pipe = FluxInferencePipeline(params)

	if os.path.exists(args.flux_ckpt):
	print('Loading transformer weights')
	pipe.load_from_pretrained(
	args.flux_ckpt,
	do_convert_from_hf=args.do_convert_from_hf,
	save_converted_model_to=args.save_converted_model_to,
	)
	dtype = torch.float32
	text = args.prompts.split(',')
	pipe(
	text,
	max_sequence_length=512,
	height=args.height,
	width=args.width,
	num_inference_steps=args.inference_steps,
	num_images_per_prompt=args.num_images_per_prompt,
	offload=args.offload,
	guidance_scale=args.guidance,
	dtype=dtype,
	output_path=args.output_path,
	)