| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
import os
import re

import pytest
import torch
|
|
|
|
@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
def test_get_nemo_to_trtllm_conversion_dict_on_nemo_model():
    """Conversion dict built from a NeMo-prefixed state dict keeps 'model.'-prefixed keys."""
    try:
        from nemo.export.tensorrt_llm import TensorRTLLM
    except ImportError:
        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
        return

    placeholder = object()
    state = {
        'model.embedding.word_embeddings.weight': placeholder,
        'model.decoder.layers.0.self_attention.linear_proj.weight': placeholder,
    }
    conversion_dict = TensorRTLLM.get_nemo_to_trtllm_conversion_dict(state)

    # Every key must start with 'model.' followed by a non-dot character
    # (i.e. 'model..' or a bare 'model.' prefix with nothing after it is rejected).
    starts_with_model = re.compile(r'^model\.[^.].*')
    for key in conversion_dict.keys():
        assert starts_with_model.match(key), f"Key '{key}' does not properly start with 'model.'"
|
|
|
|
@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
def test_get_nemo_to_trtllm_conversion_dict_on_mcore_model():
    """An mcore-style (unprefixed) state dict yields the default mcore conversion dict."""
    try:
        from megatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dict import DEFAULT_CONVERSION_DICT

        from nemo.export.tensorrt_llm import TensorRTLLM
    except ImportError:
        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
        return

    placeholder = object()
    state = {
        'embedding.word_embeddings.weight': placeholder,
        'decoder.layers.0.self_attention.linear_proj.weight': placeholder,
    }
    conversion_dict = TensorRTLLM.get_nemo_to_trtllm_conversion_dict(state)

    # Without a 'model.' prefix the mapping should be the upstream default, unmodified.
    assert conversion_dict == DEFAULT_CONVERSION_DICT
|
|
|
|
@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
def test_tensorrt_llm_initialization():
    """Constructor stores its arguments without loading anything when load_model=False."""
    try:
        from nemo.export.tensorrt_llm import TensorRTLLM
    except ImportError:
        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
        return

    model_dir = "/tmp/test_model_dir"

    # Default construction: nothing loaded, engine dir derived from model dir.
    exporter = TensorRTLLM(model_dir=model_dir, load_model=False)
    assert exporter.model_dir == model_dir
    assert exporter.engine_dir == os.path.join(model_dir, "trtllm_engine")
    assert exporter.model is None
    assert exporter.tokenizer is None
    assert exporter.config is None

    # The LoRA checkpoint list is kept verbatim.
    lora_ckpt_list = ["/path/to/lora1", "/path/to/lora2"]
    exporter = TensorRTLLM(model_dir=model_dir, lora_ckpt_list=lora_ckpt_list, load_model=False)
    assert exporter.lora_ckpt_list == lora_ckpt_list

    # Runtime configuration flags are stored as given.
    exporter = TensorRTLLM(
        model_dir=model_dir,
        use_python_runtime=False,
        enable_chunked_context=False,
        max_tokens_in_paged_kv_cache=None,
        load_model=False,
    )
    assert exporter.use_python_runtime is False
    assert exporter.enable_chunked_context is False
    assert exporter.max_tokens_in_paged_kv_cache is None
|
|
|
|
@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
def test_tensorrt_llm_supported_models():
    """Supported-model listings are non-empty and have the expected container types."""
    try:
        from nemo.export.tensorrt_llm import TensorRTLLM
    except ImportError:
        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
        return

    exporter = TensorRTLLM(model_dir="/tmp/test_model_dir", load_model=False)

    # Supported model names: a non-empty list of strings.
    supported = exporter.get_supported_models_list
    assert isinstance(supported, list)
    assert len(supported) > 0
    assert all(isinstance(name, str) for name in supported)

    # HF model mapping: a non-empty dict.
    hf_mapping = exporter.get_supported_hf_model_mapping
    assert isinstance(hf_mapping, dict)
    assert len(hf_mapping) > 0
|
|
|
|
@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
def test_tensorrt_llm_input_dtype():
    """Each torch storage dtype maps to the corresponding megatron DataType.

    Fix: the megatron ``DataType`` import previously happened after the
    try/except guard, so a missing megatron made this test ERROR instead of
    SKIP. It now lives inside the guard, consistent with
    ``test_get_nemo_to_trtllm_conversion_dict_on_mcore_model``.
    """
    try:
        # Import inside the guard so a missing megatron/tensorrt_llm skips the test.
        from megatron.core.export.data_type import DataType

        from nemo.export.tensorrt_llm import TensorRTLLM
    except ImportError:
        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
        return

    model_dir = "/tmp/test_model_dir"
    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)

    # (torch storage dtype, expected megatron DataType) pairs.
    test_cases = [
        (torch.float32, DataType.float32),
        (torch.float16, DataType.float16),
        (torch.bfloat16, DataType.bfloat16),
    ]

    for storage_dtype, expected_dtype in test_cases:
        input_dtype = trt_llm.get_input_dtype(storage_dtype)
        assert input_dtype == expected_dtype, f"Expected {expected_dtype} for {storage_dtype}, got {input_dtype}"
|
|
|
|
@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
def test_tensorrt_llm_hidden_size():
    """Hidden size is either None (no engine loaded) or a positive int.

    Fix: the original ``else: assert hidden_size is None`` branch was a
    tautology (that branch is only reached when the value IS None), so it
    asserted nothing and has been dropped.
    """
    try:
        from nemo.export.tensorrt_llm import TensorRTLLM
    except ImportError:
        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
        return

    model_dir = "/tmp/test_model_dir"
    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)

    # With load_model=False the property may legitimately be None; when it is
    # populated it must be a positive integer.
    hidden_size = trt_llm.get_hidden_size
    if hidden_size is not None:
        assert isinstance(hidden_size, int)
        assert hidden_size > 0
|
|
|
|
@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
def test_tensorrt_llm_triton_io():
    """Triton input/output tensor descriptors carry the expected names in order."""
    try:
        from nemo.export.tensorrt_llm import TensorRTLLM
    except ImportError:
        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
        return

    exporter = TensorRTLLM(model_dir="/tmp/test_model_dir", load_model=False)

    # Input descriptors, positionally named.
    expected_input_names = (
        "prompts",
        "max_output_len",
        "top_k",
        "top_p",
        "temperature",
        "random_seed",
        "stop_words_list",
        "bad_words_list",
        "no_repeat_ngram_size",
    )
    triton_input = exporter.get_triton_input
    assert isinstance(triton_input, tuple)
    # Indexing (rather than zip) preserves an IndexError if the tuple is too short.
    for idx, expected_name in enumerate(expected_input_names):
        assert triton_input[idx].name == expected_name

    # Output descriptors, positionally named.
    expected_output_names = ("outputs", "generation_logits", "context_logits")
    triton_output = exporter.get_triton_output
    assert isinstance(triton_output, tuple)
    for idx, expected_name in enumerate(expected_output_names):
        assert triton_output[idx].name == expected_name
|
|
|
|
@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
def test_tensorrt_llm_pad_logits():
    """_pad_logits keeps the batch/sequence dims and never shrinks the vocab dim."""
    try:
        from nemo.export.tensorrt_llm import TensorRTLLM
    except ImportError:
        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
        return

    exporter = TensorRTLLM(model_dir="/tmp/test_model_dir", load_model=False)

    # Random logits of shape (batch, seq, vocab).
    batch, seq, vocab = 2, 3, 1000
    logits = torch.randn(batch, seq, vocab)

    padded = exporter._pad_logits(logits)
    assert isinstance(padded, torch.Tensor)
    assert padded.shape[0] == batch
    assert padded.shape[1] == seq
    # The vocab dimension may be padded up but must never be truncated.
    assert padded.shape[2] >= vocab
|
|