import argparse

import uvicorn

from api import app


def parse_args():
    parser = argparse.ArgumentParser(description="Launch Flux API server")
    parser.add_argument(
        "-c",
        "--config-path",
        type=str,
        help="Path to the configuration file. If not provided, the model is loaded from the command-line arguments",
    )
    parser.add_argument(
        "-p",
        "--port",
        type=int,
        default=8088,
        help="Port to run the server on",
    )
    parser.add_argument(
        "-H",
        "--host",
        type=str,
        default="0.0.0.0",
        help="Host to run the server on",
    )
    parser.add_argument(
        "-f", "--flow-model-path", type=str, help="Path to the flow model"
    )
    parser.add_argument(
        "-t", "--text-enc-path", type=str, help="Path to the text encoder"
    )
    parser.add_argument(
        "-a", "--autoencoder-path", type=str, help="Path to the autoencoder"
    )
    parser.add_argument(
        "-m",
        "--model-version",
        type=str,
        choices=["flux-dev", "flux-schnell"],
        default="flux-dev",
        help="Choose model version",
    )
    parser.add_argument(
        "-F",
        "--flux-device",
        type=str,
        default="cuda:0",
        help="Device to run the flow model on",
    )
    parser.add_argument(
        "-T",
        "--text-enc-device",
        type=str,
        default="cuda:0",
        help="Device to run the text encoder on",
    )
    parser.add_argument(
        "-A",
        "--autoencoder-device",
        type=str,
        default="cuda:0",
        help="Device to run the autoencoder on",
    )
    parser.add_argument(
        "-q",
        "--num-to-quant",
        type=int,
        default=20,
        help="Number of linear layers in the flow transformer (the 'unet') to quantize",
    )
    parser.add_argument(
        "-C",
        "--compile",
        action="store_true",
        default=False,
        help="Compile the flow model with extra optimizations",
    )
    parser.add_argument(
        "-qT",
        "--quant-text-enc",
        type=str,
        default="qfloat8",
        choices=["qint4", "qfloat8", "qint2", "qint8", "bf16"],
        help="Quantize the T5 text encoder to the given dtype; bf16 leaves it unquantized",
        dest="quant_text_enc",
    )
    parser.add_argument(
        "-qA",
        "--quant-ae",
        action="store_true",
        default=False,
        help="Quantize the autoencoder with float8 linear layers; otherwise bfloat16 is used",
        dest="quant_ae",
    )
    parser.add_argument(
        "-OF",
        "--offload-flow",
        action="store_true",
        default=False,
        dest="offload_flow",
        help="Offload the flow model to the CPU when not in use to save memory",
    )
    parser.add_argument(
        "-OA",
        "--no-offload-ae",
        action="store_false",
        default=True,
        dest="offload_ae",
        help="Disable offloading the autoencoder to the CPU when not in use, which increases e2e inference speed",
    )
    parser.add_argument(
        "-OT",
        "--no-offload-text-enc",
        action="store_false",
        default=True,
        dest="offload_text_enc",
        help="Disable offloading the text encoder to the CPU when not in use, which increases e2e inference speed",
    )
    # NOTE: argparse %-formats help strings, so a literal "%" must be escaped as "%%".
    parser.add_argument(
        "-PF",
        "--prequantized-flow",
        action="store_true",
        default=False,
        dest="prequantized_flow",
        help="Load the flow model from a prequantized checkpoint "
        "(requires loading the flow model, running a minimum of 24 steps, "
        "and then saving the state_dict as a safetensors file), "
        "which reduces the size of the checkpoint by about 50%% and reduces startup time",
    )
    parser.add_argument(
        "-nqfm",
        "--no-quantize-flow-modulation",
        action="store_false",
        default=True,
        dest="quantize_modulation",
        help="Disable quantization of the modulation layers in the flow model; adds ~2GB VRAM usage for moderate precision improvements",
    )
    parser.add_argument(
        "-qfl",
        "--quantize-flow-embedder-layers",
        action="store_true",
        default=False,
        dest="quantize_flow_embedder_layers",
        help="Quantize the flow embedder layers in the flow model; saves ~512MB VRAM usage, but precision loss is very noticeable",
    )
    return parser.parse_args()


def main():
    args = parse_args()

    # Imported here rather than at module top so that argument parsing and
    # `--help` don't require loading the heavy model dependencies.
    from flux_pipeline import FluxPipeline
    from util import load_config, ModelVersion

    if args.config_path:
        app.state.model = FluxPipeline.load_pipeline_from_config_path(
            args.config_path, flow_model_path=args.flow_model_path
        )
    else:
        model_version = (
            ModelVersion.flux_dev
            if args.model_version == "flux-dev"
            else ModelVersion.flux_schnell
        )
        config = load_config(
            model_version,
            flux_path=args.flow_model_path,
            flux_device=args.flux_device,
            ae_path=args.autoencoder_path,
            ae_device=args.autoencoder_device,
            text_enc_path=args.text_enc_path,
            text_enc_device=args.text_enc_device,
            flow_dtype="float16",
            text_enc_dtype="bfloat16",
            ae_dtype="bfloat16",
            num_to_quant=args.num_to_quant,
            compile_extras=args.compile,
            compile_blocks=args.compile,
            quant_text_enc=(
                None if args.quant_text_enc == "bf16" else args.quant_text_enc
            ),
            quant_ae=args.quant_ae,
            offload_flow=args.offload_flow,
            offload_ae=args.offload_ae,
            offload_text_enc=args.offload_text_enc,
            prequantized_flow=args.prequantized_flow,
            quantize_modulation=args.quantize_modulation,
            quantize_flow_embedder_layers=args.quantize_flow_embedder_layers,
        )
        app.state.model = FluxPipeline.load_pipeline_from_config(config)

    uvicorn.run(app, host=args.host, port=args.port)


if __name__ == "__main__":
    main()
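# Example invocations (a sketch; the script name and model paths below are
# illustrative placeholders, not values shipped with this file):
#
#   python main.py --config-path configs/config-dev.json
#   python main.py -f /models/flux-dev.sft -a /models/ae.sft -t /models/t5 \
#       --quant-text-enc qfloat8 --offload-flow -p 8088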