# llm-perf-leaderboard / src/llm_perf.py
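"""Build the LLM-Perf leaderboard dataframe.

Downloads the Open LLM scores and a per-machine performance report from the
optimum/llm-perf-dataset dataset on the Hugging Face Hub, merges them, derives
optimization/quantization/throughput/energy columns, and returns a renamed,
sorted dataframe ready for display.
"""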
import os

import pandas as pd
from huggingface_hub import hf_hub_download

LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
HF_TOKEN = os.environ.get("HF_TOKEN", None)

COLUMNS_MAPPING = {
    "Model": "Model 🤗",
    "Arch": "Arch 🏛️",
    "Size": "Params (B)",
    "Score": "Open LLM Score (%)",
    # deployment settings
    "backend.name": "Backend 🏭",
    "backend.torch_dtype": "DType 📥",
    "optimization": "Optimization 🛠️",
    "quantization": "Quantization 🗜️",
    # primary measurements
    "forward.latency(s)": "Prefill Latency (s)",
    "decode.throughput(tokens/s)": "Decode Throughput (tokens/s)",
    "generate.max_memory_allocated(MB)": "Allocated Memory (MB)",
    "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh)",
    # additional measurements
    "generate.latency(s)": "E2E Latency (s)",
    "generate.throughput(tokens/s)": "E2E Throughput (tokens/s)",
    "generate.max_memory_reserved(MB)": "Reserved Memory (MB)",
    "generate.max_memory_used(MB)": "Used Memory (MB)",
}
SORTING_COLUMNS = [
    "Open LLM Score (%)",
    "Prefill Latency (s)",
    "Decode Throughput (tokens/s)",
]
SORTING_ASCENDING = [False, True, False]


def get_llm_df():
    # download the Open LLM scores and read them into a dataframe
    hf_hub_download(
        repo_id=LLM_PERF_DATASET_REPO,
        filename="open-llm.csv",
        local_dir="dataset",
        repo_type="dataset",
        token=HF_TOKEN,
    )
    llm_df = pd.read_csv("dataset/open-llm.csv")
    return llm_df


def get_perf_df(machine: str = "hf-dgx-01"):
    # download the performance report for the given machine and read it
    hf_hub_download(
        repo_id=LLM_PERF_DATASET_REPO,
        filename=f"{machine}/perf-report.csv",
        local_dir="dataset",
        repo_type="dataset",
        token=HF_TOKEN,
    )
    perf_df = pd.read_csv(f"dataset/{machine}/perf-report.csv")
    return perf_df


def get_llm_perf_df(machine: str = "hf-dgx-01"):
    # get dataframes
    llm_df = get_llm_df()
    perf_df = get_perf_df(machine=machine)
    llm_perf_df = pd.merge(llm_df, perf_df, left_on="Model", right_on="model")
    # sanity checks: all rows should share the same benchmark settings
    assert llm_perf_df["benchmark.input_shapes.batch_size"].nunique() == 1
    assert llm_perf_df["benchmark.input_shapes.sequence_length"].nunique() == 1
    assert llm_perf_df["benchmark.new_tokens"].nunique() == 1
    # invert energy consumption from kWh/token to tokens/kWh;
    # missing measurements are filled with 1 so they come out as exactly
    # 1 token/kWh and can be flagged afterwards
    llm_perf_df["generate.energy_consumption(tokens/kWh)"] = (
        1 / llm_perf_df["generate.energy_consumption(kWh/token)"].fillna(1)
    ).astype(int)
    # restore NaN for the rows that had no energy measurement
    llm_perf_df.loc[
        llm_perf_df["generate.energy_consumption(tokens/kWh)"] == 1,
        "generate.energy_consumption(tokens/kWh)",
    ] = pd.NA
    # add optimization column
    llm_perf_df["optimization"] = llm_perf_df[
        ["backend.to_bettertransformer", "backend.use_flash_attention_2"]
    ].apply(
        lambda x: "BetterTransformer"
        if x["backend.to_bettertransformer"]
        else ("FlashAttentionV2" if x["backend.use_flash_attention_2"] else "None"),
        axis=1,
    )
    # add quantization scheme
    llm_perf_df["quantization"] = llm_perf_df[
        [
            "backend.quantization_scheme",
            "backend.quantization_config.exllama_config.version",
        ]
    ].apply(
        lambda x: "BnB.4bit"
        if x["backend.quantization_scheme"] == "bnb"
        else (
            "GPTQ.4bit+ExllamaV1"
            if (x["backend.quantization_scheme"] == "gptq")
            and (x["backend.quantization_config.exllama_config.version"] == 1)
            else (
                "GPTQ.4bit+ExllamaV2"
                if (x["backend.quantization_scheme"] == "gptq")
                and (x["backend.quantization_config.exllama_config.version"] == 2)
                else "None"
            )
        ),
        axis=1,
    )
    # add decode throughput: decoded tokens over decode time, where decode
    # time is e2e latency minus prefill latency; the hardcoded 1000 is
    # presumably the benchmark's new_tokens setting asserted above
    llm_perf_df["decode.throughput(tokens/s)"] = (
        1000 / (llm_perf_df["generate.latency(s)"] - llm_perf_df["forward.latency(s)"])
    ).round(2)
    # filter columns
    llm_perf_df = llm_perf_df[list(COLUMNS_MAPPING.keys())]
    # rename columns
    llm_perf_df.rename(columns=COLUMNS_MAPPING, inplace=True)
    # sort by metrics
    llm_perf_df.sort_values(
        by=SORTING_COLUMNS,
        ascending=SORTING_ASCENDING,
        inplace=True,
    )
    return llm_perf_df
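

# A minimal usage sketch (not part of the original module): assuming HF_TOKEN
# in the environment grants access to optimum/llm-perf-dataset, this builds
# the leaderboard dataframe for the default machine and prints the top rows.
if __name__ == "__main__":
    df = get_llm_perf_df(machine="hf-dgx-01")
    print(df.head(10).to_string())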