from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm import tqdm
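
# Benchmarks generation throughput (tokens/sec) and peak GPU memory for
# Gemma 2 2B using transformers' generate().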
|
|
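# Benchmark configuration: target device, number of timed runs, forced
# generation length, and the prompt.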
DEVICE = "cuda:1"
NUM_RUNS = 10
MAX_NEW_TOKENS = 1000
TEXT_INPUT = "def sieve_of_eratosthenes():"
|
|
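# Load the model and move it to the benchmark GPU.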
repo_id = "gg-hf/gemma-2-2b-it"
model = AutoModelForCausalLM.from_pretrained(repo_id).to(DEVICE)
|
|
# `assistant_model` is a placeholder for assisted generation; leave it as None
# to benchmark the main model alone (see the sketch at the end of the script).
assistant_model = None
tokenizer = AutoTokenizer.from_pretrained(repo_id, use_fast=True)
model_inputs = tokenizer(TEXT_INPUT, return_tensors="pt").to(DEVICE)
|
|
generate_kwargs = {
    "max_new_tokens": MAX_NEW_TOKENS,
    "do_sample": True,
    "temperature": 0.2,
    "eos_token_id": -1,  # disable EOS stopping so every run emits exactly MAX_NEW_TOKENS tokens
}
|
|
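# Warm up: the first generate() calls pay one-time costs (CUDA context,
# memory-pool allocation), so keep them out of the timed runs.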
| print("Warming up...") |
| for _ in range(2): |
| gen_out = model.generate(**model_inputs, **generate_kwargs) |
| print("Done!") |
|
|
|
|
def measure_generate(model, model_inputs, generate_kwargs):
    # Make DEVICE the current CUDA device so that the timing events and the
    # bare synchronize() calls below target the GPU that actually runs the
    # generation (otherwise they would default to cuda:0).
    torch.cuda.set_device(DEVICE)
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    torch.cuda.reset_peak_memory_stats(DEVICE)
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

    # Time NUM_RUNS full generations with CUDA events.
    start_event.record()
    for _ in tqdm(range(NUM_RUNS)):
        gen_out = model.generate(**model_inputs, **generate_kwargs)
    end_event.record()

    torch.cuda.synchronize()
    max_memory = torch.cuda.max_memory_allocated(DEVICE)
    print("Max memory (MB): ", max_memory * 1e-6)
    # elapsed_time() returns milliseconds; each run emits exactly MAX_NEW_TOKENS tokens.
    print("Throughput (tokens/sec): ", (NUM_RUNS * MAX_NEW_TOKENS) / (start_event.elapsed_time(end_event) * 1.0e-3))
|
|
measure_generate(model, model_inputs, generate_kwargs)
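
# Optional sketch (an assumption, not part of the measured run above): to
# benchmark assisted generation, point `assistant_model` at a smaller draft
# model that shares the main model's tokenizer and pass it to generate().
# The draft repo id below is a hypothetical placeholder; substitute a real
# compatible checkpoint before uncommenting.
#
# assistant_model = AutoModelForCausalLM.from_pretrained(
#     "<small-draft-checkpoint>"  # hypothetical repo id, must share the tokenizer
# ).to(DEVICE)
# generate_kwargs["assistant_model"] = assistant_model
# measure_generate(model, model_inputs, generate_kwargs)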