| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import argparse |
| from nemo.collections.llm.api import deploy |
|
|
| |
| |
| |
|
|
|
|
def get_parser():
    """Build the command-line parser for the NeMo 2.0 deployment script.

    Returns:
        argparse.ArgumentParser: parser exposing the checkpoint path, the
        node/GPU topology, the model-parallelism sizes, and the Triton
        server batching limits consumed by ``deploy`` in ``__main__``.
    """
    parser = argparse.ArgumentParser(description="NeMo2.0 Deployment")
    # NOTE: the original had stray trailing commas after the first four
    # add_argument calls, silently creating throwaway 1-tuples; removed.
    parser.add_argument(
        "--nemo_checkpoint",
        type=str,
        help="NeMo 2.0 checkpoint to be deployed",
    )
    parser.add_argument(
        "--ngpus",
        type=int,
        default=1,
        help="Num of gpus per node",
    )
    parser.add_argument(
        "--nnodes",
        type=int,
        default=1,
        help="Num of nodes",
    )
    parser.add_argument(
        "--tensor_parallelism_size",
        type=int,
        default=1,
        help="Tensor parallelism size to deploy the model",
    )
    parser.add_argument(
        "--pipeline_parallelism_size",
        type=int,
        default=1,
        help="Pipeline parallelism size to deploy the model",
    )
    parser.add_argument(
        "--context_parallel_size",
        type=int,
        default=1,
        help="Context parallelism size to deploy the model",
    )
    parser.add_argument(
        "--expert_model_parallel_size",
        type=int,
        default=1,
        help="Expert model parallelism size to deploy the model",
    )
    parser.add_argument(
        "--expert_tensor_parallel_size",
        type=int,
        default=1,
        help="Expert tensor parallelism size to deploy the model",
    )
    parser.add_argument(
        "--max_batch_size",
        type=int,
        default=8,
        help="Max batch size for the underlying Triton server",
    )
    parser.add_argument(
        "--max_input_len",
        type=int,
        default=4096,
        help="Max input length for the underlying Triton server",
    )
    return parser
|
|
|
|
| if __name__ == "__main__": |
| args = get_parser().parse_args() |
| deploy( |
| nemo_checkpoint=args.nemo_checkpoint, |
| num_gpus=args.ngpus, |
| num_nodes=args.nnodes, |
| fastapi_port=8886, |
| tensor_parallelism_size=args.tensor_parallelism_size, |
| pipeline_parallelism_size=args.pipeline_parallelism_size, |
| context_parallel_size=args.context_parallel_size, |
| expert_model_parallel_size=args.expert_model_parallel_size, |
| expert_tensor_parallel_size=args.expert_tensor_parallel_size, |
| max_batch_size=args.max_batch_size, |
| max_input_len=args.max_input_len, |
| ) |
|
|