import os

import pandas as pd
from huggingface_hub import hf_hub_download

LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
HF_TOKEN = os.environ.get("HF_TOKEN", None)
COLUMNS_MAPPING = {
    "Model": "Model 🤗",
    "Arch": "Arch 🏛️",
    "Size": "Params (B)",
    "Score": "Open LLM Score (%)",
    # deployment settings
    "backend.name": "Backend 🏭",
    "backend.torch_dtype": "DType 📥",
    "optimization": "Optimization 🛠️",
    "quantization": "Quantization 🗜️",
    # primary measurements
    "forward.latency(s)": "Prefill Latency (s)",
    "decode.throughput(tokens/s)": "Decode Throughput (tokens/s)",
    "generate.max_memory_allocated(MB)": "Allocated Memory (MB)",
    "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh)",
    # additional measurements
    "generate.latency(s)": "E2E Latency (s)",
    "generate.throughput(tokens/s)": "E2E Throughput (tokens/s)",
    "generate.max_memory_reserved(MB)": "Reserved Memory (MB)",
    "generate.max_memory_used(MB)": "Used Memory (MB)",
}
SORTING_COLUMNS = [
    "Open LLM Score (%)",
    "Prefill Latency (s)",
    "Decode Throughput (tokens/s)",
]
SORTING_ASCENDING = [False, True, False]
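# rows are ranked by Open LLM score (descending), then prefill latency
# (ascending, lower is better), then decode throughput (descending)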

def get_llm_df():
    """Download and load the Open LLM Leaderboard scores as a dataframe."""
    hf_hub_download(
        repo_id=LLM_PERF_DATASET_REPO,
        filename="open-llm.csv",
        local_dir="dataset",
        repo_type="dataset",
        token=HF_TOKEN,
    )
    llm_df = pd.read_csv("dataset/open-llm.csv")
    return llm_df

def get_perf_df(machine: str = "hf-dgx-01"):
    """Download and load the performance report for the given machine."""
    hf_hub_download(
        repo_id=LLM_PERF_DATASET_REPO,
        filename=f"{machine}/perf-report.csv",
        local_dir="dataset",
        repo_type="dataset",
        token=HF_TOKEN,
    )
    perf_df = pd.read_csv(f"dataset/{machine}/perf-report.csv")
    return perf_df

def get_llm_perf_df(machine: str = "hf-dgx-01"):
    """Merge Open LLM scores with perf measurements into one leaderboard dataframe."""
    # get dataframes
    llm_df = get_llm_df()
    perf_df = get_perf_df(machine=machine)
    llm_perf_df = pd.merge(llm_df, perf_df, left_on="Model", right_on="model")
    # sanity checks: every row was benchmarked with the same input shapes
    assert llm_perf_df["benchmark.input_shapes.batch_size"].nunique() == 1
    assert llm_perf_df["benchmark.input_shapes.sequence_length"].nunique() == 1
    assert llm_perf_df["benchmark.new_tokens"].nunique() == 1
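    # with shapes and new_tokens constant across rows, latency, throughput and
    # memory numbers are directly comparable between models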
    # invert energy consumption: the report stores kWh/token, the leaderboard
    # displays tokens/kWh; fillna(1) is a sentinel that marks missing readings
    llm_perf_df["generate.energy_consumption(tokens/kWh)"] = (
        1 / llm_perf_df["generate.energy_consumption(kWh/token)"].fillna(1)
    ).astype(int)
    # restore NA where the sentinel ended up (1 token/kWh means no reading)
    llm_perf_df.loc[
        llm_perf_df["generate.energy_consumption(tokens/kWh)"] == 1,
        "generate.energy_consumption(tokens/kWh)",
    ] = pd.NA
    # add optimization column
    llm_perf_df["optimization"] = llm_perf_df[["backend.to_bettertransformer", "backend.use_flash_attention_2"]].apply(
        lambda x: "BetterTransformer"
        if x["backend.to_bettertransformer"]
        else ("FlashAttentionV2" if x["backend.use_flash_attention_2"] else "None"),
        axis=1,
    )
    # add quantization scheme; a nested helper keeps the mapping readable
    def quantization_label(row):
        if row["backend.quantization_scheme"] == "bnb":
            return "BnB.4bit"
        if row["backend.quantization_scheme"] == "gptq":
            if row["backend.quantization_config.exllama_config.version"] == 1:
                return "GPTQ.4bit+ExllamaV1"
            if row["backend.quantization_config.exllama_config.version"] == 2:
                return "GPTQ.4bit+ExllamaV2"
        return "None"

    llm_perf_df["quantization"] = llm_perf_df[
        [
            "backend.quantization_scheme",
            "backend.quantization_config.exllama_config.version",
        ]
    ].apply(quantization_label, axis=1)
    # add decode throughput: new tokens (1000, the benchmark's new_tokens
    # setting, asserted constant above) divided by decode time, i.e. e2e
    # latency minus prefill latency
    llm_perf_df["decode.throughput(tokens/s)"] = (
        1000 / (llm_perf_df["generate.latency(s)"] - llm_perf_df["forward.latency(s)"])
    ).round(2)
    # filter columns
    llm_perf_df = llm_perf_df[list(COLUMNS_MAPPING.keys())]
    # rename columns
    llm_perf_df.rename(columns=COLUMNS_MAPPING, inplace=True)
    # sort by metric
    llm_perf_df.sort_values(
        by=SORTING_COLUMNS,
        ascending=SORTING_ASCENDING,
        inplace=True,
    )
    return llm_perf_df
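
# Minimal usage sketch (an addition, not part of the original file): builds the
# merged leaderboard dataframe for the default machine and prints the top rows.
# HF_TOKEN is only needed if the dataset repo requires authentication.
if __name__ == "__main__":
    df = get_llm_perf_df(machine="hf-dgx-01")
    print(df.head(10).to_string())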