from unittest.mock import MagicMock, patch

import numpy as np
import pytest
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from nemo.deploy.nlp.hf_deployable import HuggingFaceLLMDeploy


@pytest.fixture
def mock_model():
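    """Mock AutoModelForCausalLM whose generate() returns a fixed token-id tensor."""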
    model = MagicMock(spec=AutoModelForCausalLM)
    model.generate = MagicMock()
    model.generate.return_value = torch.tensor([[1, 2, 3]])
    model.cuda = MagicMock(return_value=model)
    return model


@pytest.fixture
def mock_tokenizer():
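    """Mock AutoTokenizer that encodes to fixed ids and batch-decodes to ["Generated text"]."""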
    tokenizer = MagicMock(spec=AutoTokenizer)
    tokenizer.pad_token = "[PAD]"
    tokenizer.eos_token = "[EOS]"
    tokenizer.batch_decode = MagicMock(return_value=["Generated text"])
    tokenizer.return_value = {"input_ids": torch.tensor([[1, 2, 3]]), "attention_mask": torch.tensor([[1, 1, 1]])}
    return tokenizer


@pytest.fixture
def mock_peft_model():
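    """Patch PeftModel inside hf_deployable so PEFT weights are never loaded from disk."""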
| with patch("nemo.deploy.nlp.hf_deployable.PeftModel") as mock: |
| mock.from_pretrained.return_value = MagicMock() |
| yield mock |
|
|
|
|
@pytest.fixture
def mock_distributed():
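    """Patch torch.distributed to emulate rank 1 of an initialized two-rank process group."""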
| with patch("torch.distributed") as mock: |
| mock.is_initialized.return_value = True |
| mock.get_world_size.return_value = 2 |
| mock.get_rank.return_value = 1 |
| mock.broadcast = MagicMock(return_value=torch.tensor([0])) |
| yield mock |
|
|
|
|
@pytest.fixture
def mock_torch_cuda():
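    """Keep execution on CPU by patching torch.cuda.is_available and Tensor.cuda."""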
    with patch("torch.cuda.is_available", return_value=False):
        with patch("torch.Tensor.cuda", return_value=torch.tensor([[1, 2, 3]])):
            yield


class MockRequest:
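    """Dict-like stand-in for the request objects passed to triton_infer_fn."""
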
    def __init__(self, data):
        self.data = data
        self.span = None

    def __getitem__(self, key):
        return self.data[key]

    def keys(self):
        return self.data.keys()

    def values(self):
        return self.data.values()


class TestHuggingFaceLLMDeploy:
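    """Unit tests for HuggingFaceLLMDeploy: initialization, generation, and the Triton inference path."""
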
    def test_initialization_invalid_task(self):
        with pytest.raises(AssertionError):
            HuggingFaceLLMDeploy(hf_model_id_path="test/model", task="invalid-task")

    def test_initialization_no_model(self):
        with pytest.raises(ValueError):
            HuggingFaceLLMDeploy(task="text-generation")

    def test_initialization_with_model_and_tokenizer(self):
        model = MagicMock(spec=AutoModelForCausalLM)
        tokenizer = MagicMock(spec=AutoTokenizer)
        deployer = HuggingFaceLLMDeploy(model=model, tokenizer=tokenizer, task="text-generation")
        assert deployer.model == model
        assert deployer.tokenizer == tokenizer
        assert deployer.task == "text-generation"

    def test_initialization_with_model_path(self, mock_model, mock_tokenizer):
        with (
            patch("transformers.AutoModelForCausalLM.from_pretrained", return_value=mock_model),
            patch("transformers.AutoTokenizer.from_pretrained", return_value=mock_tokenizer),
        ):
            deployer = HuggingFaceLLMDeploy(hf_model_id_path="test/model", task="text-generation")
            assert deployer.model == mock_model
            assert deployer.tokenizer == mock_tokenizer

    def test_initialization_with_peft_model(self, mock_model, mock_tokenizer, mock_peft_model):
        with (
            patch("transformers.AutoModelForCausalLM.from_pretrained", return_value=mock_model),
            patch("transformers.AutoTokenizer.from_pretrained", return_value=mock_tokenizer),
        ):
            deployer = HuggingFaceLLMDeploy(
                hf_model_id_path="test/model", hf_peft_model_id_path="test/peft_model", task="text-generation"
            )
            assert deployer.model == mock_peft_model.from_pretrained.return_value

    def test_triton_input_output_config(self):
        deployer = HuggingFaceLLMDeploy(model=MagicMock(), tokenizer=MagicMock(), task="text-generation")

        inputs = deployer.get_triton_input
        outputs = deployer.get_triton_output

        assert len(inputs) == 10
        assert len(outputs) == 3

        # Input tensors exposed to Triton
        assert any(tensor.name == "prompts" for tensor in inputs)
        assert any(tensor.name == "max_length" for tensor in inputs)

        # Output tensors exposed to Triton
        assert any(tensor.name == "sentences" for tensor in outputs)
        assert any(tensor.name == "logits" for tensor in outputs)
        assert any(tensor.name == "scores" for tensor in outputs)

    def test_generate_without_model(self):
        deployer = HuggingFaceLLMDeploy(model=MagicMock(), tokenizer=MagicMock(), task="text-generation")
        deployer.model = None
        with pytest.raises(RuntimeError):
            deployer.generate(text_inputs=["test prompt"])

    def test_generate_with_model(self, mock_model, mock_tokenizer, mock_torch_cuda):
        deployer = HuggingFaceLLMDeploy(model=mock_model, tokenizer=mock_tokenizer, task="text-generation")
        output = deployer.generate(text_inputs=["test prompt"])
        assert output == ["Generated text"]
        mock_model.generate.assert_called_once()
        mock_tokenizer.batch_decode.assert_called_once()

    def test_generate_with_output_logits_and_scores(self, mock_model, mock_tokenizer, mock_torch_cuda):
        mock_model.generate.return_value = {
            "sequences": torch.tensor([[1, 2, 3]]),
            "logits": torch.tensor([1.0]),
            "scores": torch.tensor([0.5]),
        }
        deployer = HuggingFaceLLMDeploy(model=mock_model, tokenizer=mock_tokenizer, task="text-generation")
        output = deployer.generate(
            text_inputs=["test prompt"], output_logits=True, output_scores=True, return_dict_in_generate=True
        )
        assert isinstance(output, dict)
        assert "sentences" in output
        assert "logits" in output
        assert "scores" in output

    def test_triton_infer_fn(self, mock_model, mock_tokenizer):
        deployer = HuggingFaceLLMDeploy(model=mock_model, tokenizer=mock_tokenizer, task="text-generation")
        request_data = {
            "prompts": np.array(["test prompt"]),
            "temperature": np.array([[1.0]]),
            "top_k": np.array([[1]]),
            "top_p": np.array([[0.0]]),
            "max_length": np.array([[10]]),
            "output_logits": np.array([[False]]),
            "output_scores": np.array([[False]]),
        }
        requests = [MockRequest(request_data)]
        output = deployer.triton_infer_fn(requests)
        assert "sentences" in output[0]
        assert isinstance(output[0]["sentences"], np.ndarray)

    def test_triton_infer_fn_with_error(self, mock_model, mock_tokenizer):
        deployer = HuggingFaceLLMDeploy(model=mock_model, tokenizer=mock_tokenizer, task="text-generation")
        mock_model.generate.side_effect = Exception("Test error")
        request_data = {
            "prompts": np.array(["test prompt"]),
            "temperature": np.array([[1.0]]),
            "top_k": np.array([[1]]),
            "top_p": np.array([[0.0]]),
            "max_length": np.array([[10]]),
            "output_logits": np.array([[False]]),
            "output_scores": np.array([[False]]),
        }
        requests = [MockRequest(request_data)]
        output = deployer.triton_infer_fn(requests)
        assert "sentences" in output[0]
        assert "An error occurred" in str(output[0]["sentences"][0])