| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import argparse |
| import json |
| import logging |
| import shutil |
| import time |
| from dataclasses import dataclass |
| from pathlib import Path |
| from typing import Dict, List, Optional, Tuple |
|
|
| import torch |
|
|
| LOGGER = logging.getLogger("NeMo") |
|
|
| triton_supported = True |
| try: |
| from nemo.deploy import DeployPyTriton |
| from nemo.deploy.nlp import NemoQueryLLM |
| except Exception as e: |
| LOGGER.warning(f"Cannot import Triton, deployment will not be available. {type(e).__name__}: {e}") |
| triton_supported = False |
|
|
| in_framework_supported = True |
| try: |
| from megatron.core.inference.common_inference_params import CommonInferenceParams |
|
|
| from nemo.deploy.nlp import NemoQueryLLMPyTorch |
| from nemo.deploy.nlp.megatronllm_deployable import MegatronLLMDeploy, MegatronLLMDeployableNemo2 |
| except Exception as e: |
| LOGGER.warning( |
| "Cannot import MegatronLLMDeploy* classes, or NemoQueryLLMPyTorch, or CommonInferenceParams, " |
| f"in-framework inference will not be available. Reason: {type(e).__name__}: {e}" |
| ) |
| in_framework_supported = False |
|
|
| trt_llm_supported = True |
| try: |
| from nemo.export.tensorrt_llm import TensorRTLLM |
| except Exception as e: |
| LOGGER.warning(f"Cannot import the TensorRTLLM exporter, it will not be available. {type(e).__name__}: {e}") |
| trt_llm_supported = False |
|
|
| vllm_supported = True |
| try: |
| from nemo.export.vllm_exporter import vLLMExporter |
| except Exception as e: |
| LOGGER.warning(f"Cannot import the vLLM exporter, it will not be available. {type(e).__name__}: {e}") |
| vllm_supported = False |
|
|
|
|
class UsageError(Exception):
    """Raised when the command-line arguments are invalid or inconsistent."""

    pass
|
|
|
|
@dataclass
class FunctionalResult:
    """Pass/fail outcome of the functional (expected-keyword) tests.

    ``None`` means the corresponding test was not run.
    """

    # Result of checking the locally exported model's outputs.
    regular_pass: Optional[bool] = None
    # Result of checking the Triton-deployed model's outputs.
    deployed_pass: Optional[bool] = None
|
|
|
|
@dataclass
class AccuracyResult:
    """Accuracy metrics gathered by ``get_accuracy_with_lambada``."""

    # Exact-match accuracy of the locally exported model.
    accuracy: float
    # Prefix-tolerant ("relaxed") accuracy of the locally exported model.
    accuracy_relaxed: float
    # Exact-match accuracy of the Triton-deployed model.
    deployed_accuracy: float
    # Prefix-tolerant ("relaxed") accuracy of the Triton-deployed model.
    deployed_accuracy_relaxed: float
    # Wall-clock duration of the evaluation loop, in seconds.
    evaluation_time: float
|
|
|
|
def get_accuracy_with_lambada(model, nq, task_ids, lora_uids, test_data_path):
    """Measure single-token prediction accuracy on the Lambada-style test set.

    Each record supplies a prompt (``"text_before_last_word"``) and the word
    the model is expected to predict (``"last_word"``).  Both the local model
    and the deployed (Triton) model are queried greedily for one token per
    record; either side can be skipped by passing ``None``.

    Args:
        model: Local exporter/deployable to query directly, or ``None`` to skip.
        nq: Query client for the deployed model (``NemoQueryLLMPyTorch`` for
            in-framework, otherwise a TensorRT-LLM query client), or ``None``.
        task_ids: P-tuning task ids forwarded to the TensorRT-LLM calls.
        lora_uids: LoRA uids forwarded to the local TensorRT-LLM forward call.
        test_data_path: Path to a JSON file containing a list of records.

    Returns:
        AccuracyResult with strict and relaxed accuracies for both the local
        and deployed model, and the wall-clock evaluation time in seconds.
    """

    def _relaxed_match(expected: str, actual: str) -> bool:
        # "Relaxed" match: either string is a prefix of the other, except that
        # a single-character prediction is not accepted for a longer word.
        if len(actual) == 1 and len(expected) > 1:
            return False
        return expected == actual or actual.startswith(expected) or expected.startswith(actual)

    correct_answers = 0
    correct_answers_deployed = 0
    correct_answers_relaxed = 0
    correct_answers_deployed_relaxed = 0
    all_expected_outputs = []
    all_actual_outputs = []

    with open(test_data_path, 'r') as file:
        records = json.load(file)

    eval_start = time.monotonic()
    for record in records:
        prompt = record["text_before_last_word"]
        expected_output = record["last_word"].strip().lower()
        all_expected_outputs.append(expected_output)

        if model is not None:
            if in_framework_supported and isinstance(model, MegatronLLMDeployableNemo2):
                # In-framework deployable: greedy single-token generation.
                model_output = model.generate(
                    prompts=[prompt],
                    inference_params=CommonInferenceParams(
                        temperature=0.1,
                        top_k=1,
                        top_p=0.0,
                        num_tokens_to_generate=1,
                        return_log_probs=False,
                    ),
                )
                model_output = model_output[0].generated_text
            else:
                model_output = model.forward(
                    input_texts=[prompt],
                    max_output_len=1,
                    top_k=1,
                    top_p=0.0,
                    temperature=0.1,
                    task_ids=task_ids,
                    lora_uids=lora_uids,
                )
                model_output = model_output[0][0].strip().lower()
            all_actual_outputs.append(model_output)

            if expected_output == model_output:
                correct_answers += 1
            # BUGFIX(review): the previous implementation used `continue` to
            # reject single-character relaxed matches, which also skipped the
            # deployed-model query below for that record, corrupting the
            # deployed accuracy counters.  The predicate avoids the early skip.
            if _relaxed_match(expected_output, model_output):
                correct_answers_relaxed += 1

        if nq is not None:
            if in_framework_supported and isinstance(nq, NemoQueryLLMPyTorch):
                deployed_output = nq.query_llm(
                    prompts=[prompt],
                    max_length=1,
                    top_k=1,
                    top_p=0.0,
                    temperature=0.1,
                )
                # OpenAI-style response: first token of the first choice.
                # (The original `[0:]` slice was a no-op and has been removed.)
                deployed_output = deployed_output["choices"][0]["text"][0][0].strip().lower()
            else:
                deployed_output = nq.query_llm(
                    prompts=[prompt],
                    max_output_len=1,
                    top_k=1,
                    top_p=0.0,
                    temperature=0.1,
                    task_id=task_ids,
                )
                deployed_output = deployed_output[0][0].strip().lower()

            if expected_output == deployed_output:
                correct_answers_deployed += 1
            if _relaxed_match(expected_output, deployed_output):
                correct_answers_deployed_relaxed += 1
    eval_end = time.monotonic()

    # NOTE: raises ZeroDivisionError if the test file contains no records.
    total = len(all_expected_outputs)
    return AccuracyResult(
        accuracy=correct_answers / total,
        accuracy_relaxed=correct_answers_relaxed / total,
        deployed_accuracy=correct_answers_deployed / total,
        deployed_accuracy_relaxed=correct_answers_deployed_relaxed / total,
        evaluation_time=eval_end - eval_start,
    )
|
|
|
|
| |
def check_model_outputs(streaming: bool, model_outputs, expected_outputs: List[str]) -> bool:
    """Return True when every expected keyword appears in the matching output.

    In streaming mode only the last chunk (the fully accumulated response) is
    inspected; an empty stream fails immediately.  The number of outputs must
    equal the number of expected keywords.
    """
    outputs = model_outputs
    if streaming:
        if not outputs:
            return False
        # The final streamed chunk holds the complete responses.
        outputs = outputs[-1]

    if len(outputs) != len(expected_outputs):
        return False

    # Each produced text (first element of each output entry) must contain
    # its corresponding expected keyword as a substring.
    return all(keyword in produced[0] for produced, keyword in zip(outputs, expected_outputs))
|
|
|
|
def run_inference(
    model_name,
    model_type,
    prompts,
    expected_outputs,
    checkpoint_path,
    model_dir,
    use_vllm,
    use_huggingface,
    max_batch_size=8,
    use_embedding_sharing=False,
    max_input_len=128,
    max_output_len=128,
    max_num_tokens=None,
    use_parallel_embedding=False,
    ptuning=False,
    p_tuning_checkpoint=None,
    lora=False,
    lora_checkpoint=None,
    tp_size=1,
    pp_size=1,
    top_k=1,
    top_p=0.0,
    temperature=1.0,
    run_accuracy=False,
    debug=True,
    streaming=False,
    stop_words_list=None,
    test_cpp_runtime=False,
    test_deployment=False,
    test_data_path=None,
    save_engine=False,
    fp8_quantized=False,
    fp8_kvcache=False,
    trt_llm_export_kwargs=None,
    vllm_export_kwargs=None,
) -> Tuple[Optional[FunctionalResult], Optional[AccuracyResult]]:
    """Export a checkpoint with TensorRT-LLM or vLLM, then run the requested tests.

    The checkpoint at ``checkpoint_path`` is exported into ``model_dir``,
    queried with ``prompts``, optionally exercised with the C++ runtime,
    optionally deployed to Triton and re-queried, and optionally evaluated
    for accuracy on the Lambada-style test data.

    Returns:
        ``(functional_result, accuracy_result)``; ``(None, None)`` when the
        test is skipped (not enough GPUs, or a missing p-tuning / LoRA
        checkpoint), and either element is ``None`` when that test was not
        requested.

    Raises:
        Exception: if ``checkpoint_path`` does not exist.

    NOTE(review): in addition to its parameters, this function reads the
    module-level ``args`` (``args.gpu_memory_utilization`` and
    ``args.functional_test``), so it can only be called after ``get_args()``
    has run in ``__main__`` — TODO confirm whether these should be parameters.
    """
    if trt_llm_export_kwargs is None:
        trt_llm_export_kwargs = {}

    if vllm_export_kwargs is None:
        vllm_export_kwargs = {}

    if Path(checkpoint_path).exists():
        # Skip (rather than fail) when the requested tensor-parallel size
        # exceeds the number of visible GPUs.
        if tp_size > torch.cuda.device_count():
            print(
                "Path: {0} and model: {1} with {2} tps won't be tested since available # of gpus = {3}".format(
                    checkpoint_path, model_name, tp_size, torch.cuda.device_count()
                )
            )
            return (None, None)

        Path(model_dir).mkdir(parents=True, exist_ok=True)

        if debug:
            print("")
            print("")
            print(
                "################################################## NEW TEST ##################################################"
            )
            print("")

            print("Path: {0} and model: {1} with {2} tps will be tested".format(checkpoint_path, model_name, tp_size))

        # --- Optional P-tuning setup: requires an existing checkpoint. ---
        prompt_embeddings_checkpoint_path = None
        task_ids = None
        max_prompt_embedding_table_size = 0

        if ptuning:
            if Path(p_tuning_checkpoint).exists():
                prompt_embeddings_checkpoint_path = p_tuning_checkpoint
                max_prompt_embedding_table_size = 8192
                task_ids = ["0"]
                if debug:
                    print("---- PTuning enabled.")
            else:
                print("---- PTuning could not be enabled and skipping the test.")
                return (None, None)

        # --- Optional LoRA setup: requires an existing checkpoint. ---
        lora_ckpt_list = None
        lora_uids = None
        use_lora_plugin = None
        lora_target_modules = None

        if lora:
            if Path(lora_checkpoint).exists():
                lora_ckpt_list = [lora_checkpoint]
                # "-1" selects the base model (no LoRA) for the middle prompt.
                lora_uids = ["0", "-1", "0"]
                use_lora_plugin = "bfloat16"
                lora_target_modules = ["attn_qkv"]
                if debug:
                    print("---- LoRA enabled.")
            else:
                print("---- LoRA could not be enabled and skipping the test.")
                return (None, None)

        # --- Export the checkpoint with the selected backend. ---
        if use_vllm:
            exporter = vLLMExporter()

            exporter.export(
                nemo_checkpoint=checkpoint_path,
                model_dir=model_dir,
                model_type=model_type,
                tensor_parallel_size=tp_size,
                pipeline_parallel_size=pp_size,
                max_model_len=max_input_len + max_output_len,
                gpu_memory_utilization=args.gpu_memory_utilization,
                **vllm_export_kwargs,
            )
        else:
            exporter = TensorRTLLM(model_dir, lora_ckpt_list, load_model=False)
            if use_huggingface:
                exporter.export_hf_model(
                    hf_model_path=checkpoint_path,
                    max_batch_size=max_batch_size,
                    tensor_parallelism_size=tp_size,
                    max_input_len=max_input_len,
                    max_num_tokens=max_num_tokens,
                    model_type=model_type,
                )
            else:
                exporter.export(
                    nemo_checkpoint_path=checkpoint_path,
                    model_type=model_type,
                    tensor_parallelism_size=tp_size,
                    pipeline_parallelism_size=pp_size,
                    max_input_len=max_input_len,
                    max_seq_len=(max_input_len + max_output_len),
                    max_batch_size=max_batch_size,
                    use_parallel_embedding=use_parallel_embedding,
                    max_prompt_embedding_table_size=max_prompt_embedding_table_size,
                    use_lora_plugin=use_lora_plugin,
                    lora_target_modules=lora_target_modules,
                    max_num_tokens=max_num_tokens,
                    use_embedding_sharing=use_embedding_sharing,
                    fp8_quantized=fp8_quantized,
                    fp8_kvcache=fp8_kvcache,
                    **trt_llm_export_kwargs,
                )

        if ptuning:
            exporter.add_prompt_table(
                task_name="0",
                prompt_embeddings_checkpoint_path=prompt_embeddings_checkpoint_path,
            )

        # --- Query the locally exported model. ---
        output = exporter.forward(
            input_texts=prompts,
            max_output_len=max_output_len,
            top_k=top_k,
            top_p=top_p,
            temperature=temperature,
            task_ids=task_ids,
            lora_uids=lora_uids,
            streaming=streaming,
            stop_words_list=stop_words_list,
        )

        # Materialize the output (a generator when streaming).
        output = list(output)

        functional_result = FunctionalResult()

        if args.functional_test:
            functional_result.regular_pass = True
            if not check_model_outputs(streaming, output, expected_outputs):
                LOGGER.warning("Model outputs don't match the expected result.")
                functional_result.regular_pass = False

        # --- Optional C++ runtime smoke test (TensorRT-LLM only; LoRA and
        # p-tuning are not exercised here). ---
        output_cpp = ""
        if test_cpp_runtime and not use_lora_plugin and not ptuning and not use_vllm:
            exporter_cpp = TensorRTLLM(
                model_dir,
                load_model=True,
                use_python_runtime=False,
            )

            output_cpp = exporter_cpp.forward(
                input_texts=prompts,
                max_output_len=max_output_len,
                top_k=top_k,
                top_p=top_p,
                temperature=temperature,
            )

        # --- Optional Triton deployment and query of the deployed model. ---
        nq = None
        nm = None
        output_deployed = ""
        if test_deployment:
            nm = DeployPyTriton(
                model=exporter,
                triton_model_name=model_name,
                http_port=8000,
            )
            nm.deploy()
            nm.run()
            nq = NemoQueryLLM(url="localhost:8000", model_name=model_name)

            # NOTE(review): the deployed query uses fixed greedy sampling
            # (top_k=1, top_p=0.0, temperature=1.0) rather than the function's
            # top_k/top_p/temperature parameters — TODO confirm intentional.
            output_deployed = nq.query_llm(
                prompts=prompts,
                max_output_len=max_output_len,
                top_k=1,
                top_p=0.0,
                temperature=1.0,
                lora_uids=lora_uids,
            )

            # Materialize the response before inspecting it.
            output_deployed = list(output_deployed)

            if args.functional_test:
                functional_result.deployed_pass = True
                if not check_model_outputs(streaming, output_deployed, expected_outputs):
                    LOGGER.warning("Deployed model outputs don't match the expected result.")
                    functional_result.deployed_pass = False

        # Dump inputs/outputs when debugging or when any functional check failed.
        if debug or functional_result.regular_pass == False or functional_result.deployed_pass == False:
            print("")
            print("--- Prompt: ", prompts)
            print("")
            print("--- Expected keywords: ", expected_outputs)
            print("")
            print("--- Output: ", output)
            print("")
            print("--- Output deployed: ", output_deployed)
            print("")
            print("")
            print("--- Output with C++ runtime: ", output_cpp)
            print("")

        # --- Optional accuracy evaluation on the Lambada-style test data. ---
        accuracy_result = None
        if run_accuracy:
            print("Start model accuracy testing ...")
            accuracy_result = get_accuracy_with_lambada(exporter, nq, task_ids, lora_uids, test_data_path)

        if test_deployment:
            nm.stop()

        # Clean up the exported engine unless the caller asked to keep it.
        if not save_engine and model_dir:
            shutil.rmtree(model_dir)

        return (functional_result, accuracy_result)
    else:
        raise Exception("Checkpoint {0} could not be found.".format(checkpoint_path))
|
|
|
|
def run_in_framework_inference(
    model_name,
    prompts,
    checkpoint_path,
    num_gpus=1,
    max_output_len=128,
    top_k=1,
    top_p=0.0,
    temperature=1.0,
    run_accuracy=False,
    debug=True,
    test_data_path=None,
    enable_flash_decode=True,
    legacy_ckpt=False,
) -> Tuple[Optional[FunctionalResult], Optional[AccuracyResult]]:
    """Deploy a checkpoint in-framework (Megatron) to Triton and test it.

    The checkpoint is wrapped in a ``MegatronLLMDeploy`` deployable, served
    via Triton on port 8000, queried with ``prompts``, and optionally
    evaluated for accuracy on the Lambada-style test data.

    Returns:
        ``(None, accuracy_result)`` — in-framework runs produce no
        ``FunctionalResult``; ``accuracy_result`` is ``None`` unless
        ``run_accuracy`` is set.

    Raises:
        Exception: if ``checkpoint_path`` does not exist.
    """
    if Path(checkpoint_path).exists():
        if debug:
            print("")
            print("")
            print(
                "################################################## NEW TEST ##################################################"
            )
            print("")

            print("Path: {0} and model: {1} will be tested".format(checkpoint_path, model_name))

        deployed_model = MegatronLLMDeploy.get_deployable(
            checkpoint_path, num_gpus, enable_flash_decode=enable_flash_decode, legacy_ckpt=legacy_ckpt
        )

        # Serve the deployable through Triton and query it over HTTP.
        nm = DeployPyTriton(
            model=deployed_model,
            triton_model_name=model_name,
            http_port=8000,
        )
        nm.deploy()
        nm.run()
        nq = NemoQueryLLMPyTorch(url="localhost:8000", model_name=model_name)

        output_deployed = nq.query_llm(
            prompts=prompts, top_k=top_k, top_p=top_p, temperature=temperature, max_length=max_output_len
        )
        # Only the generated texts of the first choice are inspected.
        output_deployed = output_deployed["choices"][0]["text"]

        # Materialize before printing.
        output_deployed = list(output_deployed)
        print("\n --------- Output: ", output_deployed)

        accuracy_result = None
        if run_accuracy:
            print("Start model accuracy testing ...")
            # Evaluate both the direct deployable and the Triton endpoint;
            # no task ids / LoRA uids apply in-framework.
            accuracy_result = get_accuracy_with_lambada(deployed_model, nq, None, None, test_data_path)

        nm.stop()

        return (None, accuracy_result)
    else:
        raise Exception("Checkpoint {0} could not be found.".format(checkpoint_path))
|
|
|
|
def get_args():
    """Parse command-line arguments and normalize string flags to booleans.

    Most boolean options are declared as strings (default ``"False"``) so CI
    can pass literal ``True``/``False`` values; they are converted in-place
    after parsing.  The FP8 options additionally accept ``"auto"``, which
    maps to ``None``.

    Returns:
        argparse.Namespace: the parsed, normalized arguments.

    Raises:
        UsageError: if a boolean-like option has an unrecognized value.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=f"Deploy nemo models to Triton and benchmark the models",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--model_type",
        type=str,
        required=False,
    )
    # NOTE(review): required=True together with a default is contradictory —
    # the default can never apply; TODO confirm which was intended.
    parser.add_argument(
        "--min_tps",
        type=int,
        default=1,
        required=True,
    )
    parser.add_argument(
        "--max_tps",
        type=int,
    )
    parser.add_argument(
        "--pps",
        type=int,
        default=1,
    )
    parser.add_argument(
        "--checkpoint_dir",
        type=str,
        default="/tmp/nemo_checkpoint/",
        required=False,
    )
    parser.add_argument(
        "--model_dir",
        type=str,
    )
    parser.add_argument(
        "--max_batch_size",
        type=int,
        default=8,
    )
    parser.add_argument(
        "--max_input_len",
        type=int,
        default=256,
    )
    parser.add_argument(
        "--max_output_len",
        type=int,
        default=128,
    )
    parser.add_argument(
        "--max_num_tokens",
        type=int,
    )
    parser.add_argument(
        "--use_parallel_embedding",
        type=str,
        default="False",
    )
    parser.add_argument(
        "--p_tuning_checkpoint",
        type=str,
    )
    parser.add_argument(
        "--ptuning",
        type=str,
        default="False",
    )
    parser.add_argument(
        "--lora_checkpoint",
        type=str,
    )
    parser.add_argument(
        "--lora",
        type=str,
        default="False",
    )
    parser.add_argument(
        "--top_k",
        type=int,
        default=1,
    )
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.0,
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=1.0,
    )
    parser.add_argument(
        "--run_accuracy",
        type=str,
        default="False",
    )
    parser.add_argument(
        "--accuracy_threshold",
        type=float,
        default=0.5,
    )
    parser.add_argument("--streaming", default=False, action="store_true")
    parser.add_argument(
        "--test_cpp_runtime",
        type=str,
        default="False",
    )
    parser.add_argument(
        "--test_deployment",
        type=str,
        default="False",
    )
    parser.add_argument(
        "--functional_test",
        type=str,
        default="False",
    )
    parser.add_argument(
        "--debug",
        default=False,
        action='store_true',
    )
    parser.add_argument(
        "--test_data_path",
        type=str,
        default=None,
    )
    parser.add_argument(
        "--save_engine",
        type=str,
        default="False",
    )
    parser.add_argument(
        "--use_vllm",
        type=str,
        default="False",
    )
    parser.add_argument(
        "--use_huggingface",
        type=str,
        default="False",
    )
    parser.add_argument(
        "--enable_flash_decode",
        type=str,
        default="False",
    )
    parser.add_argument(
        "--in_framework",
        type=str,
        default="False",
    )
    parser.add_argument(
        "--legacy_ckpt",
        type=str,
        default="False",
        help="Load checkpoint saved with TE < 1.14 (only for in-framework inference)",
    )
    parser.add_argument(
        "-gmu",
        '--gpu_memory_utilization',
        default=0.95,
        type=float,
        help="GPU memory utilization percentage for vLLM.",
    )
    parser.add_argument(
        "-fp8",
        "--export_fp8_quantized",
        default="auto",
        type=str,
        help="Enables exporting to a FP8-quantized TRT LLM checkpoint",
    )
    parser.add_argument(
        "-kv_fp8",
        "--use_fp8_kv_cache",
        default="auto",
        type=str,
        help="Enables exporting with FP8-quantizatized KV-cache",
    )
    parser.add_argument(
        "--trt_llm_export_kwargs",
        default={},
        type=json.loads,
        help="Extra keyword arguments passed to TensorRTLLM.export",
    )
    parser.add_argument(
        "--vllm_export_kwargs",
        default={},
        type=json.loads,
        help="Extra keyword arguments passed to vLLMExporter.export",
    )

    args = parser.parse_args()

    def str_to_bool(name: str, s: str, optional: bool = False) -> Optional[bool]:
        """Convert a string flag to a bool; 'auto' maps to None when optional.

        Raises UsageError for any value outside true/1, false/0, '' or 'auto'.
        """
        s = s.lower()
        true_strings = ["true", "1"]
        false_strings = ["false", "0"]
        if s == '':
            return False
        if s in true_strings:
            return True
        if s in false_strings:
            return False
        if optional and s == 'auto':
            return None
        raise UsageError(f"Invalid boolean value for argument --{name}: '{s}'")

    # Normalize string-typed options in place.
    args.model_type = None if str(args.model_type).lower() == "none" else args.model_type
    args.test_cpp_runtime = str_to_bool("test_cpp_runtime", args.test_cpp_runtime)
    args.test_deployment = str_to_bool("test_deployment", args.test_deployment)
    args.functional_test = str_to_bool("functional_test", args.functional_test)
    args.save_engine = str_to_bool("save_engine", args.save_engine)
    args.run_accuracy = str_to_bool("run_accuracy", args.run_accuracy)
    args.use_vllm = str_to_bool("use_vllm", args.use_vllm)
    args.use_huggingface = str_to_bool("use_huggingface", args.use_huggingface)
    args.enable_flash_decode = str_to_bool("enable_flash_decode", args.enable_flash_decode)
    args.lora = str_to_bool("lora", args.lora)
    args.ptuning = str_to_bool("ptuning", args.ptuning)
    args.use_parallel_embedding = str_to_bool("use_parallel_embedding", args.use_parallel_embedding)
    args.in_framework = str_to_bool("in_framework", args.in_framework)
    args.export_fp8_quantized = str_to_bool("export_fp8_quantized", args.export_fp8_quantized, optional=True)
    args.use_fp8_kv_cache = str_to_bool("use_fp8_kv_cache", args.use_fp8_kv_cache, optional=True)
    args.legacy_ckpt = str_to_bool("legacy_ckpt", args.legacy_ckpt)

    return args
|
|
|
|
def run_inference_tests(args):
    """Validate arguments, run inference tests over TP sizes, print a summary.

    Runs either in-framework inference or export-based (TensorRT-LLM / vLLM)
    inference for every tensor-parallel size from ``args.min_tps`` doubling up
    to ``args.max_tps``, then prints a per-TP summary and an overall verdict.

    Args:
        args: parsed, normalized namespace from ``get_args()``.

    Raises:
        UsageError: if the argument combination is unsupported in this
            environment (missing backend, vLLM with p-tuning/LoRA, etc.).
        Exception: if any functional test fails, or accuracy falls below
            ``args.accuracy_threshold``.
    """
    # --- Environment / argument validation. ---
    if not args.use_vllm and not args.in_framework and not trt_llm_supported:
        raise UsageError("TensorRT-LLM engine is not supported in this environment.")

    if args.use_vllm and not vllm_supported:
        raise UsageError("vLLM engine is not supported in this environment.")

    if args.in_framework and not in_framework_supported:
        raise UsageError("In-framework inference is not supported in this environment.")

    if args.use_vllm and (args.ptuning or args.lora):
        raise UsageError("The vLLM integration currently does not support P-tuning or LoRA.")

    if args.test_deployment and not triton_supported:
        raise UsageError("Deployment tests are not available because Triton is not supported in this environment.")

    if args.run_accuracy and args.test_data_path is None:
        raise UsageError("Accuracy testing requires the --test_data_path argument.")

    if args.max_tps is None:
        args.max_tps = args.min_tps

    if args.use_vllm and args.min_tps != args.max_tps:
        raise UsageError(
            "vLLM doesn't support changing tensor parallel group size without relaunching the process. "
            "Use the same value for --min_tps and --max_tps."
        )

    if args.debug:
        LOGGER.setLevel(logging.DEBUG)

    # Results per tensor-parallel size.  The functional result is None for
    # in-framework runs, which only produce accuracy numbers.
    result_dic: Dict[int, Tuple[Optional[FunctionalResult], Optional[AccuracyResult]]] = {}

    if not args.in_framework and args.model_dir is None:
        raise Exception("When using custom checkpoints, --model_dir is required.")

    prompts = ["The capital of France is", "Largest animal in the sea is"]
    expected_outputs = ["Paris", "blue whale"]
    tps = args.min_tps

    # --- Run the tests, doubling the TP size each iteration. ---
    while tps <= args.max_tps:
        if args.in_framework:
            result_dic[tps] = run_in_framework_inference(
                model_name=args.model_name,
                prompts=prompts,
                checkpoint_path=args.checkpoint_dir,
                num_gpus=tps,
                max_output_len=args.max_output_len,
                top_k=args.top_k,
                top_p=args.top_p,
                temperature=args.temperature,
                run_accuracy=args.run_accuracy,
                debug=args.debug,
                test_data_path=args.test_data_path,
                enable_flash_decode=args.enable_flash_decode,
                legacy_ckpt=args.legacy_ckpt,
            )
        else:
            result_dic[tps] = run_inference(
                model_name=args.model_name,
                model_type=args.model_type,
                prompts=prompts,
                expected_outputs=expected_outputs,
                checkpoint_path=args.checkpoint_dir,
                model_dir=args.model_dir,
                use_vllm=args.use_vllm,
                use_huggingface=args.use_huggingface,
                tp_size=tps,
                pp_size=args.pps,
                max_batch_size=args.max_batch_size,
                max_input_len=args.max_input_len,
                max_output_len=args.max_output_len,
                max_num_tokens=args.max_num_tokens,
                use_parallel_embedding=args.use_parallel_embedding,
                ptuning=args.ptuning,
                p_tuning_checkpoint=args.p_tuning_checkpoint,
                lora=args.lora,
                lora_checkpoint=args.lora_checkpoint,
                top_k=args.top_k,
                top_p=args.top_p,
                temperature=args.temperature,
                run_accuracy=args.run_accuracy,
                debug=args.debug,
                streaming=args.streaming,
                test_deployment=args.test_deployment,
                test_cpp_runtime=args.test_cpp_runtime,
                test_data_path=args.test_data_path,
                save_engine=args.save_engine,
                fp8_quantized=args.export_fp8_quantized,
                fp8_kvcache=args.use_fp8_kv_cache,
                trt_llm_export_kwargs=args.trt_llm_export_kwargs,
                vllm_export_kwargs=args.vllm_export_kwargs,
            )

        tps = tps * 2

    # --- Summarize and decide the overall verdict. ---
    functional_test_result = "PASS"
    accuracy_test_result = "PASS"
    print_separator = False
    print("============= Test Summary ============")
    # In-framework multi-GPU runs only exercise the deployed model, so the
    # accuracy threshold applies to the deployed relaxed accuracy.
    deployed_tests_only = args.in_framework and args.max_tps > 1
    for num_tps, results in result_dic.items():
        functional_result, accuracy_result = results

        if print_separator:
            print("---------------------------------------")
        print_separator = True

        def optional_bool_to_pass_fail(b: Optional[bool]):
            # None means the test was not run.
            if b is None:
                return "N/A"
            return "PASS" if b else "FAIL"

        print(f"Tensor Parallelism: {num_tps}")

        if args.functional_test and functional_result is not None:
            print(f"Functional Test: {optional_bool_to_pass_fail(functional_result.regular_pass)}")
            print(f"Deployed Functional Test: {optional_bool_to_pass_fail(functional_result.deployed_pass)}")

            # `is False` keeps None (test not run) from counting as a failure.
            if functional_result.regular_pass is False:
                functional_test_result = "FAIL"
            if functional_result.deployed_pass is False:
                functional_test_result = "FAIL"

        if args.run_accuracy and accuracy_result is not None:
            print(f"Model Accuracy: {accuracy_result.accuracy:.4f}")
            print(f"Relaxed Model Accuracy: {accuracy_result.accuracy_relaxed:.4f}")
            print(f"Deployed Model Accuracy: {accuracy_result.deployed_accuracy:.4f}")
            print(f"Deployed Relaxed Model Accuracy: {accuracy_result.deployed_accuracy_relaxed:.4f}")
            print(f"Evaluation Time [s]: {accuracy_result.evaluation_time:.2f}")
            if (deployed_tests_only and accuracy_result.deployed_accuracy_relaxed < args.accuracy_threshold) or (
                not deployed_tests_only and accuracy_result.accuracy_relaxed < args.accuracy_threshold
            ):
                accuracy_test_result = "FAIL"

    print("=======================================")
    if args.functional_test:
        print(f"Functional: {functional_test_result}")
    if args.run_accuracy:
        # BUGFIX(review): corrected "Acccuracy" typo in the summary line.
        print(f"Accuracy: {accuracy_test_result}")

    if functional_test_result == "FAIL":
        raise Exception("Functional test failed")

    if accuracy_test_result == "FAIL":
        raise Exception(f"Model accuracy is below {args.accuracy_threshold}")
|
|
|
|
if __name__ == '__main__':
    try:
        args = get_args()
        run_inference_tests(args)
    except UsageError as e:
        # Log the usage problem, then re-raise.  Bare `raise` preserves the
        # original traceback; `raise e` would rewrite it from here.
        LOGGER.error(f"{e}")
        raise
    except argparse.ArgumentError as e:
        LOGGER.error(f"{e}")
        raise
|
|