| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
|
|
| import argparse |
| import json |
| import shutil |
| import time |
| from pathlib import Path |
|
|
| import torch |
|
|
| from nemo.deploy.nlp.megatronllm_deployable import MegatronLLMDeployable |
|
|
# The deploy/export stack is imported defensively: when any of these imports
# fails, the module still loads and ``run_export_tests`` records the failure.
run_export_tests = True
try:
    from nemo.deploy import DeployPyTriton
    from nemo.deploy.nlp import NemoQueryLLM, NemoQueryLLMPyTorch
    from nemo.export.tensorrt_llm import TensorRTLLM
except Exception as import_error:
    # The handler stays as broad as the original one, but the reason is now
    # reported instead of being dropped; otherwise the first symptom is a
    # NameError on one of the names above, far away from the real cause.
    print("WARNING: NeMo deploy/export modules could not be imported: {0!r}".format(import_error))
    run_export_tests = False
|
|
|
|
def _lambada_relaxed_match(expected, predicted):
    """Return True when ``predicted`` matches ``expected`` under the relaxed rule.

    The relaxed rule accepts a prediction when either string is a prefix of
    the other (equality included), except that a one-character prediction is
    never accepted for a longer expected word.
    """
    if len(predicted) == 1 and len(expected) > 1:
        return False
    # NOTE(review): an empty prediction is a prefix of every word and therefore
    # still counts as a relaxed match, exactly as in the original logic.
    return predicted.startswith(expected) or expected.startswith(predicted)


def get_accuracy_with_lambada(model, nq, task_ids, lora_uids, test_data_path=None):
    """Measure last-word prediction accuracy of a model and, optionally, its deployment.

    Each record of the JSON file supplies a prompt (``text_before_last_word``)
    and the expected continuation (``last_word``). Both sides are stripped and
    lower-cased before comparison.

    Args:
        model: object whose ``forward(...)`` result is indexed as ``[0][0]`` to
            obtain the generated text.
        nq: optional query client whose ``query_llm(...)`` result is indexed the
            same way; when ``None`` the deployed accuracies are reported as 0.0.
        task_ids: forwarded to ``model.forward`` as ``task_ids`` and to
            ``nq.query_llm`` as ``task_id``.
        lora_uids: forwarded to ``model.forward`` only.
        test_data_path: path of the JSON file holding the records.

    Returns:
        Tuple ``(accuracy, relaxed_accuracy, deployed_accuracy,
        deployed_relaxed_accuracy, evaluation_time_seconds)``.

    Raises:
        Exception: if ``test_data_path`` is None or the file holds no records.
    """
    if test_data_path is None:
        raise Exception("test_data_path cannot be None.")

    with open(test_data_path, 'r', encoding="utf-8") as file:
        records = json.load(file)

    # Fail with a clear message instead of a ZeroDivisionError further down.
    if not records:
        raise Exception("No evaluation records found in {0}.".format(test_data_path))

    trtllm_correct = 0
    trtllm_deployed_correct = 0
    trtllm_correct_relaxed = 0
    trtllm_deployed_correct_relaxed = 0

    eval_start = time.perf_counter()
    for record in records:
        prompt = record["text_before_last_word"]
        expected_output = record["last_word"].strip().lower()

        trtllm_output = model.forward(
            input_texts=[prompt],
            max_output_len=1,
            top_k=1,
            top_p=0,
            temperature=0.1,
            task_ids=task_ids,
            lora_uids=lora_uids,
        )
        trtllm_output = trtllm_output[0][0].strip().lower()

        if expected_output == trtllm_output:
            trtllm_correct += 1
        if _lambada_relaxed_match(expected_output, trtllm_output):
            trtllm_correct_relaxed += 1

        # The deployed model is scored independently of the local result. The
        # original code used ``continue`` in the relaxed check above, which
        # silently skipped this query for some records.
        if nq is None:
            continue

        trtllm_deployed_output = nq.query_llm(
            prompts=[prompt],
            max_output_len=1,
            top_k=1,
            top_p=0,
            temperature=0.1,
            task_id=task_ids,
        )
        trtllm_deployed_output = trtllm_deployed_output[0][0].strip().lower()

        if expected_output == trtllm_deployed_output:
            trtllm_deployed_correct += 1
        if _lambada_relaxed_match(expected_output, trtllm_deployed_output):
            trtllm_deployed_correct_relaxed += 1
    eval_end = time.perf_counter()

    num_records = len(records)
    return (
        trtllm_correct / num_records,
        trtllm_correct_relaxed / num_records,
        trtllm_deployed_correct / num_records,
        trtllm_deployed_correct_relaxed / num_records,
        eval_end - eval_start,
    )
|
|
|
|
def run_in_framework_inference(
    model_name,
    prompt,
    checkpoint_path,
    n_gpu=1,
    max_batch_size=None,
    max_input_len=None,
    max_output_len=None,
):
    """Serve a checkpoint through ``MegatronLLMDeployable`` on Triton and query it once.

    Args:
        model_name: name the model is registered under on the Triton server.
        prompt: prompts sent to the deployed model.
        checkpoint_path: checkpoint handed to ``MegatronLLMDeployable``.
        n_gpu: second positional argument of ``MegatronLLMDeployable``.
        max_batch_size: accepted for signature parity with the TensorRT-LLM
            runner; not used here.
        max_input_len: accepted for signature parity; not used here.
        max_output_len: accepted for signature parity; not used here.

    Returns:
        A tuple of five ``None`` values; this path produces no accuracy metrics.
    """
    model = MegatronLLMDeployable(checkpoint_path, n_gpu)
    nm = DeployPyTriton(
        model=model,
        triton_model_name=model_name,
        http_port=8000,
    )
    nm.deploy()
    nm.run()

    # Once the server is up it must be stopped even when the query fails,
    # otherwise it is left running for whatever is executed next.
    try:
        nq = NemoQueryLLMPyTorch(url="localhost:8000", model_name=model_name)
        output_deployed = nq.query_llm(
            prompts=prompt,
        )
        print("Output: ", output_deployed)
    finally:
        nm.stop()

    return None, None, None, None, None
|
|
|
|
def run_trt_llm_inference(
    model_name,
    model_type,
    prompt,
    checkpoint_path,
    trt_llm_model_dir,
    n_gpu=1,
    max_batch_size=8,
    use_embedding_sharing=False,
    max_input_len=128,
    max_output_len=128,
    max_num_tokens=None,
    ptuning=False,
    p_tuning_checkpoint=None,
    lora=False,
    lora_checkpoint=None,
    tp_size=None,
    pp_size=None,
    top_k=1,
    top_p=0.0,
    temperature=1.0,
    run_accuracy=False,
    debug=True,
    streaming=False,
    stop_words_list=None,
    test_deployment=False,
    test_data_path=None,
    save_engine=False,
):
    """Export a checkpoint with ``TensorRTLLM``, run it, and optionally deploy and score it.

    Steps: export the checkpoint into ``trt_llm_model_dir``, run ``forward`` on
    ``prompt``, re-run the engine through ``test_cpp_runtime`` (only without
    LoRA and p-tuning), optionally serve it via ``DeployPyTriton`` and query
    it, and optionally compute accuracy with ``get_accuracy_with_lambada``.

    Args:
        model_name: Triton model name; also used in log messages.
        model_type: forwarded to ``TensorRTLLM.export``.
        prompt: input texts for the forward pass and the deployed query.
        checkpoint_path: checkpoint to export; must exist.
        trt_llm_model_dir: engine directory; removed at the end unless
            ``save_engine`` is true.
        n_gpu: the run is skipped when this exceeds the visible CUDA devices.
        max_batch_size, use_embedding_sharing, max_input_len, max_output_len,
            max_num_tokens, tp_size, pp_size: forwarded to ``export``.
        ptuning / p_tuning_checkpoint: enable p-tuning from the given checkpoint.
        lora / lora_checkpoint: enable LoRA from the given checkpoint.
        top_k, top_p, temperature, streaming, stop_words_list: forwarded to the
            local ``forward`` call.
        run_accuracy: compute accuracy metrics on ``test_data_path``.
        debug: print the banner, prompt and outputs.
        test_deployment: also serve the engine on Triton and query it.
        test_data_path: dataset for the accuracy evaluation.
        save_engine: keep ``trt_llm_model_dir`` after the run.

    Returns:
        The 5-tuple from ``get_accuracy_with_lambada`` when ``run_accuracy`` is
        set, otherwise (and for skipped runs) a tuple of five ``None`` values.

    Raises:
        Exception: if ``checkpoint_path`` does not exist.
    """
    no_result = (None, None, None, None, None)

    if not Path(checkpoint_path).exists():
        raise Exception("Checkpoint {0} could not be found.".format(checkpoint_path))

    if n_gpu > torch.cuda.device_count():
        print(
            "Path: {0} and model: {1} with {2} gpus won't be tested since available # of gpus = {3}".format(
                checkpoint_path, model_name, n_gpu, torch.cuda.device_count()
            )
        )
        return no_result

    if debug:
        print("")
        print("")
        print(
            "################################################## NEW TEST ##################################################"
        )
        print("")

        # NOTE(review): placed inside the debug branch; the original nesting of
        # this line could not be recovered from the de-indented source.
        print("Path: {0} and model: {1} with {2} gpus will be tested".format(checkpoint_path, model_name, n_gpu))

    prompt_embeddings_checkpoint_path = None
    task_ids = None
    max_prompt_embedding_table_size = 0

    if ptuning:
        # A missing (None) checkpoint is treated like a non-existent one
        # instead of crashing inside Path().
        if p_tuning_checkpoint is None or not Path(p_tuning_checkpoint).exists():
            print("---- PTuning could not be enabled and skipping the test.")
            return no_result
        prompt_embeddings_checkpoint_path = p_tuning_checkpoint
        max_prompt_embedding_table_size = 8192
        task_ids = ["0"]
        if debug:
            print("---- PTuning enabled.")

    lora_ckpt_list = None
    lora_uids = None
    use_lora_plugin = None
    lora_target_modules = None

    if lora:
        if lora_checkpoint is None or not Path(lora_checkpoint).exists():
            print("---- LoRA could not be enabled and skipping the test.")
            return no_result
        lora_ckpt_list = [lora_checkpoint]
        lora_uids = ["0", "-1", "0"]
        use_lora_plugin = "bfloat16"
        lora_target_modules = ["attn_qkv"]
        if debug:
            print("---- LoRA enabled.")

    # Created only after every skip check so a skipped run leaves nothing behind.
    Path(trt_llm_model_dir).mkdir(parents=True, exist_ok=True)

    trt_llm_exporter = TensorRTLLM(trt_llm_model_dir, lora_ckpt_list, load_model=False)

    trt_llm_exporter.export(
        nemo_checkpoint_path=checkpoint_path,
        model_type=model_type,
        tensor_parallelism_size=tp_size,
        pipeline_parallelism_size=pp_size,
        max_input_len=max_input_len,
        max_output_len=max_output_len,
        max_batch_size=max_batch_size,
        max_prompt_embedding_table_size=max_prompt_embedding_table_size,
        use_lora_plugin=use_lora_plugin,
        lora_target_modules=lora_target_modules,
        max_num_tokens=max_num_tokens,
        opt_num_tokens=60,
        use_embedding_sharing=use_embedding_sharing,
    )

    if ptuning:
        trt_llm_exporter.add_prompt_table(
            task_name="0",
            prompt_embeddings_checkpoint_path=prompt_embeddings_checkpoint_path,
        )

    output = trt_llm_exporter.forward(
        input_texts=prompt,
        max_output_len=max_output_len,
        top_k=top_k,
        top_p=top_p,
        temperature=temperature,
        task_ids=task_ids,
        lora_uids=lora_uids,
        streaming=streaming,
        stop_words_list=stop_words_list,
    )

    if not use_lora_plugin and not ptuning:
        # debug is fixed to True here, as in the original call.
        test_cpp_runtime(
            engine_path=trt_llm_model_dir,
            prompt=prompt,
            max_output_len=max_output_len,
            debug=True,
        )

    nq = None
    nm = None
    server_running = False
    output_deployed = ""
    result = no_result

    try:
        if test_deployment:
            nm = DeployPyTriton(
                model=trt_llm_exporter,
                triton_model_name=model_name,
                http_port=8000,
            )
            nm.deploy()
            nm.run()
            server_running = True
            nq = NemoQueryLLM(url="localhost:8000", model_name=model_name)

            # The deployed query uses fixed sampling values, not the
            # top_k/top_p/temperature arguments of this function.
            output_deployed = nq.query_llm(
                prompts=prompt,
                max_output_len=max_output_len,
                top_k=1,
                top_p=0.0,
                temperature=1.0,
                lora_uids=lora_uids,
            )

        if debug:
            print("")
            print("--- Prompt: ", prompt)
            print("")
            print("--- Output: ", output)
            print("")
            print("")
            print("--- Output deployed: ", output_deployed)
            print("")

        if run_accuracy:
            print("Start model accuracy testing ...")
            result = get_accuracy_with_lambada(trt_llm_exporter, nq, task_ids, lora_uids, test_data_path)
    finally:
        # Stop the server on every exit path, including failures of the query
        # or of the accuracy evaluation.
        if server_running:
            nm.stop()

    if not save_engine:
        shutil.rmtree(trt_llm_model_dir)

    return result
|
|
|
|
def test_cpp_runtime(
    engine_path,
    prompt,
    max_output_len,
    debug,
):
    """Load an existing engine from ``engine_path`` and run one forward pass on ``prompt``.

    A new ``TensorRTLLM`` instance is created with ``load_model=True`` and
    queried with fixed sampling values (top_k=1, top_p=0.0, temperature=1.0).
    The generated output is printed when ``debug`` is truthy; nothing is
    returned.
    """
    sampling = {"top_k": 1, "top_p": 0.0, "temperature": 1.0}
    loaded_engine = TensorRTLLM(engine_path, load_model=True)
    generated = loaded_engine.forward(
        input_texts=prompt,
        max_output_len=max_output_len,
        **sampling,
    )

    if not debug:
        return

    print("")
    print("--- Output deployed with cpp runtime: ", generated)
    print("")
|
|
|
|
def get_args():
    """Build the command-line parser for this script and parse ``sys.argv``.

    ``--run_accuracy``, ``--test_deployment`` and ``--save_engine`` are kept as
    the strings "True"/"False"; ``run_inference_tests`` converts them to
    booleans.

    Returns:
        argparse.Namespace holding the parsed options.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Deploy nemo models to Triton and benchmark the models",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        required=True,
        help="Name the model is registered under on Triton",
    )
    parser.add_argument(
        "--model_type",
        type=str,
        required=False,
        help="Model type passed to the TensorRT-LLM export",
    )
    parser.add_argument(
        "--min_gpus",
        type=int,
        default=1,
        help="GPU count of the first run; it is doubled for each following run",
    )
    parser.add_argument(
        "--max_gpus",
        type=int,
        help="Largest GPU count to run with; falls back to --min_gpus when omitted",
    )
    parser.add_argument(
        "--checkpoint_dir",
        type=str,
        default="/tmp/nemo_checkpoint/",
        required=False,
        help="Path of the checkpoint to test",
    )
    parser.add_argument(
        "--trt_llm_model_dir",
        type=str,
        help="Directory the TensorRT-LLM engine is exported to",
    )
    parser.add_argument(
        "--max_batch_size",
        type=int,
        default=8,
        help="max_batch_size passed to the export",
    )
    parser.add_argument(
        "--max_input_len",
        type=int,
        default=256,
        help="max_input_len passed to the export",
    )
    parser.add_argument(
        "--max_output_len",
        type=int,
        default=128,
        help="max_output_len passed to the export and to generation",
    )
    parser.add_argument(
        "--max_num_tokens",
        type=int,
        help="max_num_tokens passed to the export",
    )
    parser.add_argument(
        "--p_tuning_checkpoint",
        type=str,
        help="Prompt-embeddings checkpoint used when --ptuning is set",
    )
    parser.add_argument(
        "--ptuning",
        default=False,
        action='store_true',
        help="Enable p-tuning",
    )
    parser.add_argument(
        "--lora_checkpoint",
        type=str,
        help="LoRA checkpoint used when --lora is set",
    )
    parser.add_argument(
        "--lora",
        default=False,
        action='store_true',
        help="Enable LoRA",
    )
    parser.add_argument(
        "--tp_size",
        type=int,
        default=1,
        help="Tensor parallelism size passed to the export",
    )
    parser.add_argument(
        "--pp_size",
        type=int,
        default=1,
        help="Pipeline parallelism size passed to the export",
    )
    parser.add_argument(
        "--top_k",
        type=int,
        default=1,
        help="top_k used for generation",
    )
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.0,
        help="top_p used for generation",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=1.0,
        help="temperature used for generation",
    )
    parser.add_argument(
        "--run_accuracy",
        type=str,
        default="False",
        help='Pass "True" to run the accuracy evaluation (requires --test_data_path)',
    )
    parser.add_argument(
        "--streaming",
        default=False,
        action="store_true",
        help="Enable streaming for generation",
    )
    parser.add_argument(
        "--test_deployment",
        type=str,
        default="False",
        help='Pass "True" to also deploy the model on Triton and query it',
    )
    parser.add_argument(
        "--debug",
        default=False,
        action='store_true',
        help="Print prompts and outputs",
    )
    # NOTE(review): this flag is parsed but not read anywhere in this file.
    parser.add_argument(
        "--ci_upload_test_results_to_cloud",
        default=False,
        action='store_true',
    )
    parser.add_argument(
        "--test_data_path",
        type=str,
        default=None,
        help="JSON dataset used by the accuracy evaluation",
    )
    parser.add_argument(
        "-b",
        '--backend',
        nargs='?',
        # A bare "-b" used to yield None, which crashed the later
        # ``args.backend.lower()`` call; it now falls back to the default.
        const='TensorRT-LLM',
        default='TensorRT-LLM',
        choices=['TensorRT-LLM', 'vLLM', 'In-Framework'],
        help="Different options to deploy nemo model.",
    )
    parser.add_argument(
        "--save_engine",
        type=str,
        default="False",
        help='Pass "True" to keep the exported engine directory after the run',
    )

    return parser.parse_args()
|
|
|
|
def _flag_enabled(value):
    """Interpret a CLI flag that is either the string "True" or an actual bool."""
    return value is True or value == "True"


def run_inference_tests(args):
    """Run the selected backend for a doubling series of GPU counts and print a summary.

    The string flags ``test_deployment``, ``save_engine`` and ``run_accuracy``
    on ``args`` are replaced by booleans in place. Runs start at
    ``args.min_gpus`` and double until ``args.max_gpus`` is exceeded
    (``max_gpus`` defaults to ``min_gpus``).

    Args:
        args: namespace as produced by ``get_args``.

    Raises:
        Exception: if accuracy is requested without ``test_data_path``, if
            ``min_gpus`` is smaller than 1, if the backend is not implemented
            by this script, or if any run reports a relaxed accuracy below 0.5.
    """
    # Accepting real booleans as well keeps a second call on the same
    # namespace from flipping an already converted True back to False.
    args.test_deployment = _flag_enabled(args.test_deployment)
    args.save_engine = _flag_enabled(args.save_engine)
    args.run_accuracy = _flag_enabled(args.run_accuracy)

    if args.run_accuracy and args.test_data_path is None:
        raise Exception("test_data_path param cannot be None.")

    # The GPU count is doubled after every run, so a start value below 1
    # would never reach max_gpus and the loop would not terminate.
    if args.min_gpus < 1:
        raise Exception("min_gpus must be at least 1, got {0}.".format(args.min_gpus))

    backend = (args.backend or "TensorRT-LLM").lower()
    if backend not in ("tensorrt-llm", "in-framework"):
        # Previously any other backend silently ran the in-framework path.
        raise Exception("Backend {0} is not supported by this script.".format(args.backend))

    result_dic = {}

    prompt_template = ["The capital of France is", "Largest animal in the sea is"]
    n_gpus = args.min_gpus
    if args.max_gpus is None:
        args.max_gpus = args.min_gpus

    while n_gpus <= args.max_gpus:
        if backend == "tensorrt-llm":
            result_dic[n_gpus] = run_trt_llm_inference(
                model_name=args.model_name,
                model_type=args.model_type,
                prompt=prompt_template,
                checkpoint_path=args.checkpoint_dir,
                trt_llm_model_dir=args.trt_llm_model_dir,
                n_gpu=n_gpus,
                max_batch_size=args.max_batch_size,
                max_input_len=args.max_input_len,
                max_output_len=args.max_output_len,
                max_num_tokens=args.max_num_tokens,
                ptuning=args.ptuning,
                p_tuning_checkpoint=args.p_tuning_checkpoint,
                lora=args.lora,
                lora_checkpoint=args.lora_checkpoint,
                tp_size=args.tp_size,
                pp_size=args.pp_size,
                top_k=args.top_k,
                top_p=args.top_p,
                temperature=args.temperature,
                run_accuracy=args.run_accuracy,
                debug=args.debug,
                streaming=args.streaming,
                test_deployment=args.test_deployment,
                test_data_path=args.test_data_path,
                save_engine=args.save_engine,
            )
        else:
            result_dic[n_gpus] = run_in_framework_inference(
                model_name=args.model_name,
                prompt=prompt_template,
                checkpoint_path=args.checkpoint_dir,
                n_gpu=n_gpus,
                max_batch_size=args.max_batch_size,
                max_input_len=args.max_input_len,
                max_output_len=args.max_output_len,
            )

        n_gpus = n_gpus * 2

    test_result = "PASS"
    print_separator = False
    print("============= Test Summary ============")
    for i, results in result_dic.items():
        # Skipped runs and runs without accuracy report None for every metric.
        if results[0] is None or results[1] is None:
            continue
        if print_separator:
            print("---------------------------------------")
        print(
            "Number of GPUS: {}\n"
            "Model Accuracy: {:.4f}\n"
            "Relaxed Model Accuracy: {:.4f}\n"
            "Deployed Model Accuracy: {:.4f}\n"
            "Deployed Relaxed Model Accuracy: {:.4f}\n"
            "Evaluation Time [s]: {:.2f}".format(i, *results)
        )
        print_separator = True
        # Only the relaxed accuracy of the local model decides pass/fail.
        if results[1] < 0.5:
            test_result = "FAIL"

    print("=======================================")
    print("TEST: " + test_result)
    if test_result == "FAIL":
        raise Exception("Model accuracy is below 0.5")
|
|
|
|
if __name__ == "__main__":
    # Script entry point: parse the command line and run the requested tests.
    run_inference_tests(get_args())
|
|