|
|
|
|
| |
| import json |
|
|
| import pandas as pd |
| import plotly.express as px |
| import plotly.graph_objects as go |
| import plotly.io as pio |
| from plotly.subplots import make_subplots |
|
|
| |
| |
| |
| |
| |
# Load the raw benchmark records and flatten the nested JSON into one
# DataFrame row per benchmark run.
with open('llm_gpu_benchmarks.json') as benchmarks_file:
    raw_records = json.load(benchmarks_file)
del benchmarks_file

df = pd.json_normalize(raw_records)
del raw_records
|
|
| |
| |
| |
# --- Tidy the flattened benchmark table ------------------------------------
# Drop bookkeeping columns that the plots below never read.
df.drop(columns=['task', 'ngpus', 'reps', 'date', 'git_sha', 'transformers', 'bitsandbytes', 'cuda', 'hostname',
                 'summarize_input_len_bytes'], inplace=True)

df.rename(columns={'n_gpus': 'gpu_count'}, inplace=True)

# Parse the free-form `gpus` description, which appears to look like
# "<count> x <name> (<memory in MiB> ...)" — TODO confirm against the raw JSON.
# FIX: the original pattern `[1-9] x` only matched single-digit GPU counts;
# `\d+ x` also accepts hosts with 10 or more GPUs and is otherwise equivalent.
df["gpu_name"] = df.gpus.str.extract(r'\d+ x ([\w\- ]+) .+')
df["gpu_memory_gb"] = round(
    pd.to_numeric(df.gpus.str.extract(r'[\w ]+ \(([\d]+) .+', expand=False), errors='coerce') / 1024)
# Nullable integer dtype so rows whose memory failed to parse stay <NA>.
df["gpu_memory_gb"] = df["gpu_memory_gb"].astype('Int64')
df.drop(columns=['gpus'], inplace=True)

# Strip vendor noise from the GPU name, then prefix it with the memory size
# (e.g. "80-A100 SXM4") so chart labels sort by memory first.
df.gpu_name = df.gpu_name.str.replace('NVIDIA ', '')
df.gpu_name = df.gpu_name.str.replace('GeForce ', '')
df.gpu_name = df.gpu_name.str.replace('A100-SXM4-80GB', 'A100 SXM4')
df.gpu_name = df.gpu_memory_gb.astype(str) + "-" + df.gpu_name

# Rows whose `gpus` string did not match the name pattern are unusable.
df.drop(df[df.gpu_name.isnull()].index, inplace=True)

# Keep a single measurement per (backend, model, bits, gpu-count, gpu) config.
df.drop_duplicates(['backend', 'base_model', 'bits', 'gpu_count', 'gpu_name'], inplace=True)
|
|
| |
| |
| |
# CPU-only baseline throughputs in output bytes per second — presumably from a
# separate CPU benchmark run (output bytes / elapsed seconds); TODO confirm.
cpu_summary_out_throughput = 1353 / 1216
cpu_generate_out_throughput = 849 / 180

# Per-row GPU throughput, also in output bytes per second.
df["summary_out_throughput"] = df["summarize_output_len_bytes"] / df["summarize_time"]
df["generate_out_throughput"] = df["generate_output_len_bytes"] / df["generate_time"]

# Normalize against the CPU baseline: values above 1 mean faster than CPU.
df["summary_out_throughput_normalize"] = df["summary_out_throughput"] / cpu_summary_out_throughput
df["generate_out_throughput_normalize"] = df["generate_out_throughput"] / cpu_generate_out_throughput
|
|
| |
| |
|
|
| |
# Open generated figures in the default web browser.
pio.renderers.default = "browser"

# One fixed color per quantization level ('4'/'8'/'16' bits) so the bars stay
# comparable across every subplot and figure.
bits_bar_colors = {
    str(bits): px.colors.qualitative.D3[slot]
    for slot, bits in enumerate((4, 8, 16))
}

# Axes of the figure grid: one figure per backend, one row per GPU count, and
# two columns (summarization / generation) per base model.
backends = list(df['backend'].unique())
base_models = list(df['base_model'].unique())
n_gpus = list(df['gpu_count'].unique())
|
|
| |
# ---------------------------------------------------------------------------
# Emit one standalone HTML figure per backend.  Grid: one row per GPU count,
# two adjacent columns per base model (summarization, then generation);
# horizontal bars show CPU-normalized throughput, grouped by quantization bits.
# ---------------------------------------------------------------------------
for backend in backends:
    fig_bar = make_subplots(rows=len(n_gpus),
                            cols=len(base_models) * 2,
                            shared_xaxes='all',
                            shared_yaxes='columns',
                            start_cell="top-left",
                            vertical_spacing=0.1,
                            print_grid=False,
                            row_titles=[f'{gpu_count} GPUs' for gpu_count in n_gpus],
                            # NOTE(review): hard-coded titles assume exactly these three
                            # base models in this order — verify against base_models.
                            column_titles=['llama2-7b-chat Summarization', 'llama2-7b-chat Generation',
                                           'llama2-13b-chat Summarization', 'llama2-13b-chat Generation',
                                           'llama2-70b-chat Summarization', 'llama2-70b-chat Generation'],)

    for base_model in base_models:
        for gpu_count in n_gpus:
            # Subplot coordinates are 1-based.  Hoisted out of the bits loop;
            # the original recomputed them per trace and inconsistently wrapped
            # n_gpus/base_models in a redundant list() for the generation trace.
            row = n_gpus.index(gpu_count) + 1
            summarize_col = base_models.index(base_model) * 2 + 1
            for bits in sorted(df.bits.unique()):
                sub_df = df[(df.backend == backend) &
                            (df.base_model == base_model) &
                            (df.gpu_count == gpu_count) &
                            (df.bits == bits)].sort_values(by='gpu_name')
                # NOTE(review): identical trace names repeat across subplots, so the
                # legend shows duplicate entries; consider showlegend=False after the
                # first trace of each legendgroup.
                fig_bar.add_trace(go.Bar(x=sub_df.summary_out_throughput_normalize,
                                         y=sub_df.gpu_name,
                                         name=f'sum-{bits} bits',
                                         legendgroup=f'sum-{bits} bits',
                                         marker=dict(color=bits_bar_colors[f'{bits}']),
                                         orientation='h'),
                                  row=row,
                                  col=summarize_col)
                fig_bar.add_trace(go.Bar(x=sub_df.generate_out_throughput_normalize,
                                         y=sub_df.gpu_name,
                                         name=f'gen-{bits} bits',
                                         legendgroup=f'gen-{bits} bits',
                                         marker=dict(color=bits_bar_colors[f'{bits}']),
                                         orientation='h'),
                                  row=row,
                                  col=summarize_col + 1)

    fig_bar.update_layout(plot_bgcolor='rgb(250,250,250)',
                          showlegend=True,
                          barmode="group")
    # Self-contained page; plotly.js is loaded from the CDN to keep files small.
    fig_bar.write_html(f'llm_gpu_benchmark_{backend}.html', include_plotlyjs='cdn')