import numpy as np
from pytriton.decorators import batch
from pytriton.model_config import Tensor

from nemo.deploy import DeployPyTriton, ITritonDeployable
from nemo.deploy.nlp import NemoQueryLLM
from nemo.deploy.utils import cast_output, str_ndarray2list


class MockModel(ITritonDeployable):
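    """Minimal ITritonDeployable that returns a canned response, used to exercise the deploy-and-query path without loading a real model."""
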
    @property
    def get_triton_input(self):
        inputs = (
            Tensor(name="prompts", shape=(-1,), dtype=bytes),
            Tensor(name="max_output_len", shape=(-1,), dtype=np.int_, optional=True),
            Tensor(name="output_context_logits", shape=(-1,), dtype=np.bool_, optional=True),
            Tensor(name="output_generation_logits", shape=(-1,), dtype=np.bool_, optional=True),
        )
        return inputs

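    # Single output tensor: the generated text, returned as variable-length bytes.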
    @property
    def get_triton_output(self):
        outputs = (Tensor(name="outputs", shape=(-1,), dtype=bytes),)
        return outputs

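    # @batch delivers the inputs as batched numpy arrays keyed by tensor name.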
    @batch
    def triton_infer_fn(self, **inputs: np.ndarray):
        # Decode the request as a real deployable would; the mock ignores it.
        infer_input = {"input_texts": str_ndarray2list(inputs.pop("prompts"))}
        if "max_output_len" in inputs:
            infer_input["max_output_len"] = inputs.pop("max_output_len")[0][0]

        # Always reply with the same canned text, encoded for Triton.
        output_dict = dict()
        output_dict["outputs"] = cast_output("I am good, how about you?", np.bytes_)
        return output_dict


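# End-to-end smoke test: deploy MockModel on PyTriton, send one prompt,
# and verify the canned response comes back.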
def test_nemo_deploy_query():
    model_name = "mock_model"
    model = MockModel()
    nm = DeployPyTriton(
        model=model,
        triton_model_name=model_name,
        max_batch_size=32,
        http_port=9002,
        grpc_port=8001,
        address="0.0.0.0",
        allow_grpc=True,
        allow_http=True,
        streaming=False,
    )
    nm.deploy()
    nm.run()

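    # Query the server (HTTP port 9002) through the NemoQueryLLM client.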
    nq = NemoQueryLLM(url="localhost:9002", model_name=model_name)
    output_deployed = nq.query_llm(
        prompts=["Hey, how is it going?"],
        max_output_len=20,
    )
    nm.stop()

    assert output_deployed is not None, "Output cannot be none."
    assert output_deployed == "I am good, how about you?", "Output should match the mock model's canned response."