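"""
Deploy a NeMo (.nemo) or Hugging Face causal LM checkpoint to a PyTriton server,
either by exporting it to TensorRT-LLM or by serving it in-framework, with an
optional OpenAI-style REST service on top.

Example invocation (script name, paths, and values are illustrative only):
    python deploy_triton.py --nemo_checkpoint /models/model.nemo --model_type llama --triton_model_name my_model --triton_port 8000
"""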
import argparse
import json
import logging
import os
import sys
from pathlib import Path
from typing import Optional

import uvicorn
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

from nemo.deploy import DeployPyTriton

LOGGER = logging.getLogger("NeMo")


class UsageError(Exception):
    pass


megatron_llm_supported = True
try:
    from nemo.deploy.nlp.megatronllm_deployable import MegatronLLMDeployable
except Exception as e:
    LOGGER.warning(f"Cannot import MegatronLLMDeployable, it will not be available. {type(e).__name__}: {e}")
    megatron_llm_supported = False

trt_llm_supported = True
try:
    from nemo.export.tensorrt_llm import TensorRTLLM
except Exception as e:
    LOGGER.warning(f"Cannot import the TensorRTLLM exporter, it will not be available. {type(e).__name__}: {e}")
    trt_llm_supported = False


def get_args(argv):
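    """Parse the command line arguments and normalize the FP8 string flags to booleans."""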
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Deploy nemo models to Triton",
    )
    parser.add_argument("-nc", "--nemo_checkpoint", type=str, help="Source .nemo file")
    parser.add_argument("-hfp", "--hf_model_id_path", type=str, help="Huggingface model path or id")
    parser.add_argument(
        "-ptnc",
        "--ptuning_nemo_checkpoint",
        nargs='+',
        type=str,
        required=False,
        help="Source .nemo file for prompt embeddings table",
    )
    parser.add_argument(
        '-ti', '--task_ids', nargs='+', type=str, required=False, help='Unique task names for the prompt embedding.'
    )
    parser.add_argument(
        "-mt",
        "--model_type",
        type=str,
        required=False,
        help="Type of the model. Only gptnext, gpt, llama, falcon, and starcoder are supported."
        " gptnext and gpt are the same and are kept for backward compatibility.",
    )
| parser.add_argument("-tmn", "--triton_model_name", required=True, type=str, help="Name for the service") |
| parser.add_argument("-tmv", "--triton_model_version", default=1, type=int, help="Version for the service") |
| parser.add_argument( |
| "-trp", "--triton_port", default=8000, type=int, help="Port for the Triton server to listen for requests" |
| ) |
| parser.add_argument( |
| "-tha", "--triton_http_address", default="0.0.0.0", type=str, help="HTTP address for the Triton server" |
| ) |
| parser.add_argument( |
| "-trt", "--triton_request_timeout", default=60, type=int, help="Timeout in seconds for Triton server" |
| ) |
| parser.add_argument( |
| "-tmr", "--triton_model_repository", default=None, type=str, help="Folder for the trt-llm conversion" |
| ) |
| parser.add_argument("-ng", "--num_gpus", default=None, type=int, help="Number of GPUs for the deployment") |
| parser.add_argument("-tps", "--tensor_parallelism_size", default=1, type=int, help="Tensor parallelism size") |
| parser.add_argument("-pps", "--pipeline_parallelism_size", default=1, type=int, help="Pipeline parallelism size") |
| parser.add_argument( |
| "-dt", |
| "--dtype", |
| choices=["bfloat16", "float16", "fp8", "int8"], |
| default="bfloat16", |
| type=str, |
| help="dtype of the model on TensorRT-LLM", |
| ) |
| parser.add_argument("-mil", "--max_input_len", default=256, type=int, help="Max input length of the model") |
| parser.add_argument("-mol", "--max_output_len", default=256, type=int, help="Max output length of the model") |
| parser.add_argument("-mbs", "--max_batch_size", default=8, type=int, help="Max batch size of the model") |
| parser.add_argument("-mnt", "--max_num_tokens", default=None, type=int, help="Max number of tokens") |
| parser.add_argument("-msl", "--max_seq_len", default=None, type=int, help="Maximum number of sequence length") |
| parser.add_argument("-mp", "--multiple_profiles", default=False, action='store_true', help="Multiple profiles") |
| parser.add_argument("-ont", "--opt_num_tokens", default=None, type=int, help="Optimum number of tokens") |
| parser.add_argument( |
| "-gap", "--gpt_attention_plugin", default="auto", type=str, help="dtype of gpt attention plugin" |
| ) |
| parser.add_argument("-gp", "--gemm_plugin", default="auto", type=str, help="dtype of gpt plugin") |
| parser.add_argument( |
| "-mpet", "--max_prompt_embedding_table_size", default=None, type=int, help="Max prompt embedding table size" |
| ) |
| parser.add_argument( |
| "-npkc", "--no_paged_kv_cache", default=False, action='store_true', help="Enable paged kv cache." |
| ) |
    parser.add_argument(
        "-drip",
        "--disable_remove_input_padding",
        default=False,
        action='store_true',
        help="Disables the remove input padding option.",
    )
    parser.add_argument(
        "-upe",
        "--use_parallel_embedding",
        default=False,
        action='store_true',
        help='Use parallel embedding feature of TensorRT-LLM.',
    )
    parser.add_argument(
        "-mbm",
        '--multi_block_mode',
        default=False,
        action='store_true',
        help='Split long kv sequence into multiple blocks (applied to generation MHA kernels). '
        'It is beneficial when batch x num_heads cannot fully utilize the GPU. '
        'Only available when using the C++ runtime.',
    )
    parser.add_argument(
        "-es", '--enable_streaming', default=False, action='store_true', help="Enables streaming sentences."
    )
    parser.add_argument(
        '--use_lora_plugin',
        nargs='?',
        const=None,
        choices=['float16', 'float32', 'bfloat16'],
        help="Activates the lora plugin, which adds support for LoRA adapters.",
    )
    parser.add_argument(
        '--lora_target_modules',
        nargs='+',
        default=None,
        choices=[
            "attn_qkv",
            "attn_q",
            "attn_k",
            "attn_v",
            "attn_dense",
            "mlp_h_to_4h",
            "mlp_gate",
            "mlp_4h_to_h",
        ],
        help="Modules to add LoRA to. Only effective when use_lora_plugin is enabled.",
    )
    parser.add_argument(
        '--max_lora_rank',
        type=int,
        default=64,
        help='Maximum LoRA rank for the different LoRA modules. '
        'It is used to compute the workspace size of the LoRA plugin.',
    )
    parser.add_argument(
        "-lc", "--lora_ckpt", default=None, type=str, nargs="+", help="The checkpoint list of LoRA weights"
    )
    parser.add_argument(
        "-ucr",
        '--use_cpp_runtime',
        default=False,
        action='store_true',
        help='Use TensorRT LLM C++ runtime',
    )
    parser.add_argument(
        "-b",
        '--backend',
        nargs='?',
        const=None,
        default='TensorRT-LLM',
        choices=['TensorRT-LLM', 'In-Framework'],
        help="Different options to deploy nemo model.",
    )
    parser.add_argument(
        "-srs",
        "--start_rest_service",
        default=False,
        action='store_true',
        help="Starts the REST service for OpenAI API support",
    )
    parser.add_argument(
        "-sha", "--service_http_address", default="0.0.0.0", type=str, help="HTTP address for the REST Service"
    )
    parser.add_argument("-sp", "--service_port", default=8080, type=int, help="Port for the REST Service")
    parser.add_argument(
        "-ofr",
        "--openai_format_response",
        default=False,
        action='store_true',
        help="Return the response from PyTriton server in OpenAI compatible format",
    )
| parser.add_argument("-dm", "--debug_mode", default=False, action='store_true', help="Enable debug mode") |
| parser.add_argument( |
| "-fp8", |
| "--export_fp8_quantized", |
| default="auto", |
| type=str, |
| help="Enables exporting to a FP8-quantized TRT LLM checkpoint", |
| ) |
| parser.add_argument( |
| "-kv_fp8", |
| "--use_fp8_kv_cache", |
| default="auto", |
| type=str, |
| help="Enables exporting with FP8-quantizatized KV-cache", |
| ) |
| args = parser.parse_args(argv) |
|
|
| def str_to_bool(name: str, s: str, optional: bool = False) -> Optional[bool]: |
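        """Convert a 'true'/'false'/'0'/'1' string to a bool; 'auto' maps to None when optional is True."""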
        s = s.lower()
        true_strings = ["true", "1"]
        false_strings = ["false", "0"]
        if s in true_strings:
            return True
        if s in false_strings:
            return False
        if optional and s == 'auto':
            return None
        raise UsageError(f"Invalid boolean value for argument --{name}: '{s}'")

    args.export_fp8_quantized = str_to_bool("export_fp8_quantized", args.export_fp8_quantized, optional=True)
    args.use_fp8_kv_cache = str_to_bool("use_fp8_kv_cache", args.use_fp8_kv_cache, optional=True)
    return args


def store_args_to_json(args):
    """
    Stores user-defined argument values relevant for the REST API in config.json.
    Called only when args.start_rest_service is True.
    """
    args_dict = {
        "triton_service_ip": args.triton_http_address,
        "triton_service_port": args.triton_port,
        "triton_request_timeout": args.triton_request_timeout,
        "openai_format_response": args.openai_format_response,
    }
    with open("nemo/deploy/service/config.json", "w") as f:
        json.dump(args_dict, f)


def get_trtllm_deployable(args):
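    """Export the NeMo or Hugging Face checkpoint to TensorRT-LLM and return the exporter as a Triton deployable."""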
    if args.triton_model_repository is None:
        trt_llm_path = "/tmp/trt_llm_model_dir/"
        LOGGER.info(
            "/tmp/trt_llm_model_dir/ path will be used as the TensorRT LLM folder. "
            "Please set the --triton_model_repository parameter if you'd like to use a path that already "
            "includes the TensorRT LLM model files."
        )
        Path(trt_llm_path).mkdir(parents=True, exist_ok=True)
    else:
        trt_llm_path = args.triton_model_repository

    if args.hf_model_id_path:
        # Check whether the model is already available as a local directory.
        LOGGER.info(f"Checking if the model is available in the local cache: {args.hf_model_id_path}")
        local_path = Path(args.hf_model_id_path)
        model_available = local_path.exists() and (local_path / "config.json").exists()
        if not model_available:
            # Treat the path as a Hugging Face model id and download the model
            # into a local cache directory.
            LOGGER.info(f"Downloading model from HuggingFace: {args.hf_model_id_path}")
            try:
                hf_model_cache_dir = "/tmp/hf_model_dir/"
                Path(hf_model_cache_dir).mkdir(parents=True, exist_ok=True)
                # Keep each model in its own subdirectory of the cache.
                hf_model_path = os.path.join(hf_model_cache_dir, args.hf_model_id_path)
                Path(hf_model_path).mkdir(parents=True, exist_ok=True)

                # Download the model weights.
                model = AutoModelForCausalLM.from_pretrained(
                    args.hf_model_id_path, cache_dir=hf_model_path, torch_dtype="auto", use_safetensors=True
                )
                # Download the tokenizer and the model config.
                tokenizer = AutoTokenizer.from_pretrained(args.hf_model_id_path, cache_dir=hf_model_path)
                config = AutoConfig.from_pretrained(args.hf_model_id_path, cache_dir=hf_model_path)

                # Save the weights as safetensors so they can be exported to TensorRT-LLM.
                model.save_pretrained(hf_model_path, safe_serialization=True)

                # Save the tokenizer and config next to the weights.
                tokenizer.save_pretrained(hf_model_path)
                config.save_pretrained(hf_model_path)
                args.hf_model_id_path = hf_model_path

                LOGGER.info(f"Downloaded model, tokenizer and config to {args.hf_model_id_path}")
            except Exception as e:
                raise RuntimeError(f"Error downloading from HuggingFace: {str(e)}")

    checkpoint_missing = args.nemo_checkpoint is None and args.hf_model_id_path is None
    if checkpoint_missing and args.triton_model_repository is None:
        raise ValueError(
            "No model was provided. Please provide a --nemo_checkpoint, a --hf_model_id_path, "
            "or a --triton_model_repository that already contains a TensorRT-LLM model."
        )

    if checkpoint_missing and not os.path.isdir(args.triton_model_repository):
        raise ValueError(
            "The provided --triton_model_repository is not a valid TensorRT-LLM model "
            "directory. Please provide a --nemo_checkpoint or a --hf_model_id_path."
        )

    if not checkpoint_missing and args.model_type is None:
        raise ValueError("Model type is required to be defined if a nemo checkpoint is provided.")
    ptuning_tables_files = []
    if args.ptuning_nemo_checkpoint is not None:
        if args.max_prompt_embedding_table_size is None:
            raise ValueError("max_prompt_embedding_table_size parameter is needed for the prompt tuning table(s).")

        for pt_checkpoint in args.ptuning_nemo_checkpoint:
            ptuning_nemo_checkpoint_path = Path(pt_checkpoint)
            if ptuning_nemo_checkpoint_path.exists():
                if ptuning_nemo_checkpoint_path.is_file():
                    ptuning_tables_files.append(pt_checkpoint)
                else:
                    raise IsADirectoryError("Could not read the prompt tuning tables from {0}".format(pt_checkpoint))
            else:
                raise FileNotFoundError("File or directory {0} does not exist.".format(pt_checkpoint))

        if args.task_ids is not None:
            if len(ptuning_tables_files) != len(args.task_ids):
                raise RuntimeError(
                    "Number of task ids and prompt embedding tables have to match. "
                    "There are {0} tables and {1} task ids.".format(len(ptuning_tables_files), len(args.task_ids))
                )
    trt_llm_exporter = TensorRTLLM(
        model_dir=trt_llm_path,
        lora_ckpt_list=args.lora_ckpt,
        load_model=(args.nemo_checkpoint is None and args.hf_model_id_path is None),
        use_python_runtime=(not args.use_cpp_runtime),
        multi_block_mode=args.multi_block_mode,
    )

    if args.nemo_checkpoint is not None:
        try:
            LOGGER.info("Starting export of the nemo checkpoint to TensorRT-LLM.")
            trt_llm_exporter.export(
                nemo_checkpoint_path=args.nemo_checkpoint,
                model_type=args.model_type,
                tensor_parallelism_size=args.tensor_parallelism_size,
                pipeline_parallelism_size=args.pipeline_parallelism_size,
                max_input_len=args.max_input_len,
                max_output_len=args.max_output_len,
                max_batch_size=args.max_batch_size,
                max_num_tokens=args.max_num_tokens,
                opt_num_tokens=args.opt_num_tokens,
                max_seq_len=args.max_seq_len,
                use_parallel_embedding=args.use_parallel_embedding,
                max_prompt_embedding_table_size=args.max_prompt_embedding_table_size,
                paged_kv_cache=(not args.no_paged_kv_cache),
                remove_input_padding=(not args.disable_remove_input_padding),
                dtype=args.dtype,
                use_lora_plugin=args.use_lora_plugin,
                lora_target_modules=args.lora_target_modules,
                max_lora_rank=args.max_lora_rank,
                multiple_profiles=args.multiple_profiles,
                gpt_attention_plugin=args.gpt_attention_plugin,
                gemm_plugin=args.gemm_plugin,
                fp8_quantized=args.export_fp8_quantized,
                fp8_kvcache=args.use_fp8_kv_cache,
            )
        except Exception as error:
            raise RuntimeError("An error has occurred during the model export. Error message: " + str(error))
    elif args.hf_model_id_path is not None:
        LOGGER.info("Starting export of the Hugging Face checkpoint to TensorRT-LLM.")
        try:
            trt_llm_exporter.export_hf_model(
                hf_model_path=args.hf_model_id_path,
                max_batch_size=args.max_batch_size,
                tensor_parallelism_size=args.tensor_parallelism_size,
                max_input_len=args.max_input_len,
                max_output_len=args.max_output_len,
                dtype=args.dtype,
                model_type=args.model_type,
            )
        except Exception as error:
            raise RuntimeError("An error has occurred during the model export. Error message: " + str(error))
    try:
        for i, prompt_embeddings_checkpoint_path in enumerate(ptuning_tables_files):
            if args.task_ids is not None:
                task_id = args.task_ids[i]
            else:
                task_id = i

            LOGGER.info(
                "Adding prompt embedding table: {0} with task id: {1}.".format(
                    prompt_embeddings_checkpoint_path, task_id
                )
            )
            trt_llm_exporter.add_prompt_table(
                task_name=str(task_id),
                prompt_embeddings_checkpoint_path=prompt_embeddings_checkpoint_path,
            )
    except Exception as error:
        raise RuntimeError(
            "An error has occurred while adding the prompt embedding table(s). Error message: " + str(error)
        )

    return trt_llm_exporter


def get_nemo_deployable(args):
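    """Build an in-framework MegatronLLMDeployable from the provided .nemo checkpoint."""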
    if args.nemo_checkpoint is None:
        raise ValueError("In-Framework deployment requires a .nemo checkpoint")

    return MegatronLLMDeployable(args.nemo_checkpoint, args.num_gpus)


def nemo_deploy(argv):
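    """Parse arguments, build the requested deployable, and serve it with PyTriton (optionally with the REST service)."""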
    args = get_args(argv)

    if args.debug_mode:
        loglevel = logging.DEBUG
    else:
        loglevel = logging.INFO

    LOGGER.setLevel(loglevel)
    LOGGER.info("Logging level set to {}".format(loglevel))
    LOGGER.info(args)

    if args.start_rest_service:
        if args.service_port == args.triton_port:
            LOGGER.error("The REST service port and the Triton server port cannot be the same.")
            return

        store_args_to_json(args)

    backend = args.backend.lower()
    if backend == 'tensorrt-llm':
        if not trt_llm_supported:
            raise ValueError("TensorRT-LLM engine is not supported in this environment.")
        triton_deployable = get_trtllm_deployable(args)
    elif backend == 'in-framework':
        if not megatron_llm_supported:
            raise ValueError("MegatronLLMDeployable is not supported in this environment.")
        triton_deployable = get_nemo_deployable(args)
    else:
        raise ValueError("Backend: {0} is not supported.".format(backend))
    try:
        nm = DeployPyTriton(
            model=triton_deployable,
            triton_model_name=args.triton_model_name,
            triton_model_version=args.triton_model_version,
            max_batch_size=args.max_batch_size,
            http_port=args.triton_port,
            address=args.triton_http_address,
            streaming=args.enable_streaming,
        )

        LOGGER.info("Triton deploy function will be called.")
        nm.deploy()
        nm.run()
    except Exception as error:
        LOGGER.error("An error occurred during the deploy function. Error message: " + str(error))
        return

    try:
        LOGGER.info("Model serving on Triton will be started.")
        if args.start_rest_service:
            try:
                LOGGER.info("REST service will be started.")
                uvicorn.run(
                    'nemo.deploy.service.rest_model_api:app',
                    host=args.service_http_address,
                    port=args.service_port,
                    reload=True,
                )
            except Exception as error:
                LOGGER.error("An error occurred while starting the REST service. Error message: " + str(error))
        nm.serve()
    except Exception as error:
        LOGGER.error("An error occurred during model serving. Error message: " + str(error))
        return

    LOGGER.info("Model serving will be stopped.")
    nm.stop()


if __name__ == '__main__':
    nemo_deploy(sys.argv[1:])