def _test_client(port=8000):
    """Exercise a running deploy server through the OpenAI-style client.

    Loads 1000 samples of an alpaca-style dataset, waits until the server on
    *port* is reachable, sends all requests in one batch, and prints the
    number of responses received.

    Args:
        port: Port the deploy server listens on (default 8000).
    """
    import time
    # NOTE(review): removed unused imports (`aiohttp`, `run_deploy`) — nothing
    # in this function referenced them.
    from swift.llm import InferClient, InferRequest, RequestConfig, load_dataset
    dataset = load_dataset(['AI-ModelScope/alpaca-gpt4-data-zh#1000'], num_proc=4)
    infer_client = InferClient(port=port)
    # Poll until the server is up: listing models only succeeds once it is ready.
    while True:
        try:
            infer_client.models
            break
        except Exception:
            time.sleep(1)
    # dataset[0] is presumably the train split — each sample maps onto an InferRequest.
    infer_requests = [InferRequest(**data) for data in dataset[0]]
    request_config = RequestConfig(seed=42, max_tokens=256, temperature=0.8)

    resp = infer_client.infer(infer_requests, request_config=request_config, use_tqdm=False)
    print(len(resp))
|
|
|
|
def _test(infer_backend):
    """Deploy Qwen2-7B-Instruct with the given backend and run the client check.

    Args:
        infer_backend: Inference backend identifier ('vllm', 'lmdeploy', 'pt').
    """
    import os

    # Pin to a single GPU; set before importing swift so any CUDA
    # initialization during import sees it.
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    from swift.llm import DeployArguments, run_deploy
    deploy_args = DeployArguments(model='Qwen/Qwen2-7B-Instruct', infer_backend=infer_backend, verbose=False)
    # run_deploy yields the port the server is bound to.
    with run_deploy(deploy_args) as port:
        _test_client(port)
|
|
|
|
def test_vllm():
    """End-to-end deploy test using the vLLM inference backend."""
    _test('vllm')
|
|
|
|
def test_lmdeploy():
    """End-to-end deploy test using the LMDeploy inference backend."""
    _test('lmdeploy')
|
|
|
|
def test_pt():
    """End-to-end deploy test using the native PyTorch ('pt') backend."""
    _test('pt')
|
|
|
|
def test_vllm_origin():
    """Launch a vanilla vLLM OpenAI-compatible API server as a subprocess and
    exercise it with the client helper (default port 8000).

    Fix: the server process is now terminated in a ``finally`` block — the
    original leaked the subprocess whenever ``_test_client`` raised — and
    ``wait()`` reaps it so no zombie is left behind.
    """
    import subprocess
    import sys
    from modelscope import snapshot_download
    model_dir = snapshot_download('Qwen/Qwen2-7B-Instruct')
    args = [sys.executable, '-m', 'vllm.entrypoints.openai.api_server', '--model', model_dir]
    process = subprocess.Popen(args)
    try:
        _test_client()
    finally:
        process.terminate()
        process.wait()
|
|
|
|
if __name__ == '__main__':
    # Manual entry point: runs only the LMDeploy-backend deploy test.

    test_lmdeploy()
| |
|
|