# NOTE: removed scraped Hugging Face Spaces page chrome ("Spaces: Running Running")
# that was not part of the original source file.
"""Gradio app for the ML.ENERGY leaderboard. | |
Everything is in a single file. Search for `gr.Blocks` to find the place | |
where UI elements are actually defined. | |
""" | |
from __future__ import annotations

import argparse
import contextlib
import copy
import itertools
import json
import os
import random
from abc import abstractmethod
from pathlib import Path
from typing import Any, Literal

import gradio as gr
import numpy as np
import pandas as pd
import requests
import yaml
from dateutil import parser, tz

from spitfight.colosseum.client import ControllerClient
# Whether the Colosseum (model battle arena) backend is considered reachable.
COLOSSEUM_UP = True

# Shown in place of the Colosseum UI while the backend is down for maintenance.
# (Was an f-string with no placeholders; plain literal is equivalent.)
COLOSSEUM_DOWN_MESSAGE = "<br/><h2 style='text-align: center'>The Colosseum is currently down for maintenance.</h2>"

# Embedded YouTube demo video shown while the Colosseum is down.
# NOTE(review): the name is misspelled ("COLOSSUMM"), but it is kept as-is
# because other parts of the file may reference it by this exact name.
COLOSSUMM_YOUTUBE_DEMO_EMBED_HTML = '<div style="width: 100%; min-width: 400px;"><div style="position: relative; width: 100%; overflow: hidden; padding-top: 56.25%"><p><iframe width="560" height="315" style="margin: auto; position: absolute; top: 0; left: 0; right: 0; width: 100%; height: 100%; border: none;" src="https://www.youtube.com/embed/tvNM_gLffFs?si=rW1-10pt5BffJEGH" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen></iframe><p></div></div>'
class TableManager:
    """Manages the data for the leaderboard tables for tasks.

    Abstract base class: concrete subclasses implement data loading and
    filtering for one specific task type. The `@abstractmethod` markers are
    documentation-only here (no ABC metaclass), so instantiation behavior of
    existing subclasses is unchanged.
    """

    def __init__(self, data_dir: str) -> None:
        """Load leaderboard data from files in `data_dir`.

        Expected directory structure: `data_dir/gpu_model`.
        Inside the innermost (GPU) directory, there should be:
        - `models.json`: JSON file that maps huggingface model IDs to model info.
              Some models listed in this file may not have benchmark results.
        - `model_org/model_name/*.json`: JSON files containing the benchmark results.
        """
        self.data_dir = Path(data_dir)

    def __str__(self) -> str:
        return f"{self.__class__}(data_dir={self.data_dir})"

    def _wrap_model_name(self, url: str, model_name: str) -> str:
        """Wrap the model name in an HTML anchor."""
        return f'<a style="text-decoration: underline; text-decoration-style: dotted" target="_blank" href="{url}">{model_name}</a>'

    def _unwrap_model_name(self, model_name: str) -> str:
        """Unwrap the model name from an HTML anchor."""
        # Inverse of `_wrap_model_name`: text between the first '>' and the next '<'.
        return model_name.split(">")[1].split("<")[0]

    @abstractmethod
    def get_tab_name(self) -> str:
        """Return the name of the leaderboard."""

    @abstractmethod
    def get_intro_text(self) -> str:
        """Return the introduction text to be inserted above the table."""

    @abstractmethod
    def get_detail_text(self, detail_mode: bool) -> str:
        """Return the detail text chunk to be inserted below the table."""

    def get_benchmark_checkboxes(self) -> dict[str, list[str]]:
        """Return data for the benchmark selection checkboxes."""
        return {}

    def get_benchmark_sliders(self) -> dict[str, tuple[float, float, float, float]]:
        """Return data for the benchmark selection sliders.

        Dictionary values are tuples of the form (min, max, step, default).
        """
        return {}

    @abstractmethod
    def get_all_models(self) -> list[str]:
        """Return all available models."""

    @abstractmethod
    def set_filter_get_df(self, detail_mode: bool, *filters) -> pd.DataFrame:
        """Set the current set of filters and return the filtered DataFrame."""
class LLMTableManager(TableManager):
    def __init__(self, data_dir: str, task_name: str) -> None:
        """Load leaderboard data from files in `data_dir`.

        Under `data_dir`, there should be:
        - `models.json`: JSON file that maps huggingface model IDs to model info.
              Some models listed in this file may not have benchmark results.
        - `schema.yaml`: YAML file containing the schema of the benchmark.

        Then, benchmark data files are nested under `data_dir` according to the schema.
        One directory hierarchy for each choice in the schema and then two more -- the
        model's HuggingFace hub organization and the model name.

        Raises:
            ValueError: If no benchmark JSON files could be read.
        """
        super().__init__(data_dir)

        self.task_name = task_name

        # Read in the data into a Pandas DataFrame.
        # Important: The ordering of `self.schema` determines the directory structure.
        # Use context managers so file handles are not leaked.
        with open(self.data_dir / "schema.yaml") as f:
            self.schema = yaml.safe_load(f)
        with open(self.data_dir / "models.json") as f:
            models: dict[str, dict[str, Any]] = json.load(f)

        # Accumulate per-file frames and concatenate once at the end;
        # repeated `pd.concat` inside the loop is quadratic.
        dfs: list[pd.DataFrame] = []
        for choice in itertools.product(*self.schema.values()):
            result_dir = self.data_dir / "/".join(choice)
            with contextlib.suppress(FileNotFoundError):
                for model_id, model_info in models.items():
                    for file in (result_dir / model_id).glob("*.json"):
                        with open(file) as f:
                            model_df = pd.DataFrame([json.load(f)])
                        # Sanity checks and standardization of schema values.
                        assert model_df["Model"].iloc[0] == model_id
                        for key, val in zip(self.schema.keys(), choice):
                            assert (
                                str(val).lower() in str(model_df[key].iloc[0]).lower()
                            )
                            model_df[key] = val
                        # Format the model name as an HTML anchor.
                        model_df["Model"] = self._wrap_model_name(model_info["url"], model_info["nickname"])
                        model_df["Params (B)"] = model_info["params"]
                        dfs.append(model_df)

        if not dfs:
            raise ValueError(
                f"No benchmark JSON files were read from {self.data_dir=}."
            )
        res_df = pd.concat(dfs)

        # Order columns.
        columns = res_df.columns.to_list()
        cols_to_order = ["Model", "Params (B)"]
        cols_to_order.extend(self.schema.keys())
        columns = cols_to_order + [col for col in columns if col not in cols_to_order]
        res_df = res_df[columns]

        # Order rows.
        res_df = res_df.sort_values(by=["Model", *self.schema.keys(), "Energy/req (J)"])

        self.full_df = res_df.round(2)

        # We need to set the default view separately when `gr.State` is forked.
        self.set_filter_get_df(detail_mode=False)

    def get_benchmark_checkboxes(self) -> dict[str, list[str]]:
        """Checkbox choices come directly from the benchmark schema."""
        return self.schema

    def get_benchmark_sliders(self) -> dict[str, tuple[float, float, float, float]]:
        """Single slider: target average TPOT (min, max, step, default)."""
        return {"Target Average TPOT (Time Per Output Token) (s)": (0.0, 0.5, 0.01, 0.2)}

    def get_all_models(self) -> list[str]:
        """Return all benchmarked model names (without the HTML anchor)."""
        return self.full_df["Model"].apply(self._unwrap_model_name).unique().tolist()

    def set_filter_get_df(self, detail_mode: bool, *filters) -> pd.DataFrame:
        """Set the current set of filters and return the filtered DataFrame.

        Filters can either be completely empty, or be a concatenated list of
        choices from all checkboxes and all sliders.
        """
        # If the filter is empty, we default to the first choice for each checkbox.
        if not filters:
            checkboxes = [choices[:1] for choices in self.schema.values()]
            sliders = [slider[3] for slider in self.get_benchmark_sliders().values()]
            filters = checkboxes + sliders

        index = np.full(len(self.full_df), True)
        # Checkboxes
        for setup, choice in zip(self.schema, filters):
            index = index & self.full_df[setup].isin(choice)
        cur_df = self.full_df.loc[index]

        # Sliders (We just have TPOT for now.)
        # For each `Model`, first filter out rows whose `Avg TPOT (s)` is greater
        # than the slider value, then keep only the row with the smallest
        # `Energy/req (J)`.
        tpot_slo = filters[-1]
        cur_df = (
            cur_df
            .groupby("Model")[cur_df.columns]
            .apply(lambda x: x[x["Avg TPOT (s)"] <= tpot_slo], include_groups=True)
            .sort_values(by="Energy/req (J)")
            .reset_index(drop=True)
            .groupby("Model")
            .head(1)
        )

        if not detail_mode:
            # Reduce to the beginner-friendly core columns with readable names.
            core_columns = ["Model", "Params (B)", "GPU", "Energy/req (J)"]
            readable_name_mapping = {
                "Params (B)": "Parameters (Billions)",
                "GPU": "GPU model",
                "Energy/req (J)": "Energy per response (Joules)",
            }
            cur_df = cur_df[core_columns].rename(columns=readable_name_mapping)

        return cur_df
class LLMChatTableManager(LLMTableManager):
    """LLM table manager for chat tasks."""

    def get_tab_name(self) -> str:
        """Title of this leaderboard's tab."""
        return "LLM Chat"

    def get_intro_text(self) -> str:
        """HTML introduction inserted above the table."""
        return """
<h2>How much energy do GenAI models consume?</h2>

<h3>LLM chatbot response generation</h3>

<p style="font-size: 16px">
Large language models (LLMs), especially the instruction-tuned ones, can generate human-like responses to chat prompts.
Using <a href="https://ml.energy/zeus">Zeus</a> for energy measurement, we created a leaderboard for LLM chat energy consumption.
</p>

<p style="font-size: 16px">
More models will be added over time. Stay tuned!
</p>
"""

    def get_detail_text(self, detail_mode: bool) -> str:
        """Markdown detail text inserted below the table.

        Returns the full technical column description when `detail_mode` is
        enabled, otherwise the simplified one.
        """
        detailed = """
**TPOT (Time Per Output Token)** is the time between each token generated by LLMs as part of their response.
An average TPOT of 0.20 seconds roughly corresponds to a person reading at 240 words per minute and assuming one word is 1.3 tokens on average.
You can tweak the TPOT slider to adjust the target average TPOT for the models.

Each row corresponds to one model, given a constraint on the maximum average TPOT.
If more than one GPU types were chosen, the row shows results from the GPU with the lowest energy consumption per request.

Columns
- **Model**: The name of the model.
- **Params (B)**: Number of parameters in the model.
- **GPU**: Name of the GPU model used for benchmarking.
- **TP**: Tensor parallelism degree.
- **PP**: Pipeline parallelism degree. (TP * PP is the total number of GPUs used.)
- **Energy/req (J)**: Energy consumed per request in Joules.
- **Avg TPOT (s)**: Average time per output token in seconds.
- **Token tput (toks/s)**: Average number of tokens generated by the engine per second.
- **Avg Output Tokens**: Average number of output tokens in the LLM's response.
- **Avg BS**: Average batch size of the serving engine over time.
- **Max BS**: Maximum batch size configuration of the serving engine.

For more detailed information, please take a look at the **About** tab.
"""
        brief = """
Columns
- **Model**: The name of the model.
- **Parameters (Billions)**: Number of parameters in the model. This is the size of the model.
- **GPU model**: Name of the GPU model used for benchmarking.
- **Energy per response (Joules)**: Energy consumed for each LLM response in Joules.

Checking "Show more technical details" above the table will reveal more detailed columns.
Also, for more detailed information, please take a look at the **About** tab.
"""
        return detailed if detail_mode else brief
class LLMCodeTableManager(LLMTableManager):
    """LLM table manager for coding tasks."""

    def get_tab_name(self) -> str:
        """Title of this leaderboard's tab."""
        return "LLM Code"

    def get_intro_text(self) -> str:
        """HTML introduction inserted above the table."""
        return """
<h2>How much energy do GenAI models consume?</h2>

<h3>LLM code generation</h3>

<p style="font-size: 16px">
Large language models (LLMs) are also capable of generating code.
Using <a href="https://ml.energy/zeus">Zeus</a> for energy measurement, we created a leaderboard for the energy consumption of LLMs specifically trained for code generation.
</p>

<p style="font-size: 16px">
More models will be added over time. Stay tuned!
</p>
"""

    def get_detail_text(self, detail_mode: bool) -> str:
        """Markdown detail text inserted below the table.

        Returns the full technical column description when `detail_mode` is
        enabled, otherwise the simplified one.
        """
        detailed = """
**TPOT (Time Per Output Token)** is the time between each token generated by LLMs as part of their response.
An average TPOT of 0.20 seconds roughly corresponds to a person reading at 240 words per minute and assuming one word is 1.3 tokens on average.
You can tweak the TPOT slider to adjust the target average TPOT for the models.

Each row corresponds to one model, given a constraint on the maximum average TPOT.
If more than one GPU types were chosen, the row shows results from the GPU with the lowest energy consumption per request.

Columns
- **Model**: The name of the model.
- **Params (B)**: Number of parameters in the model.
- **GPU**: Name of the GPU model used for benchmarking.
- **TP**: Tensor parallelism degree.
- **PP**: Pipeline parallelism degree. (TP * PP is the total number of GPUs used.)
- **Energy/req (J)**: Energy consumed per request in Joules.
- **Avg TPOT (s)**: Average time per output token in seconds.
- **Token tput (toks/s)**: Average number of tokens generated by the engine per second.
- **Avg Output Tokens**: Average number of output tokens in the LLM's response.
- **Avg BS**: Average batch size of the serving engine over time.
- **Max BS**: Maximum batch size configuration of the serving engine.

For more detailed information, please take a look at the **About** tab.
"""
        brief = """
Columns
- **Model**: The name of the model.
- **Parameters (Billions)**: Number of parameters in the model. This is the size of the model.
- **GPU model**: Name of the GPU model used for benchmarking.
- **Energy per response (Joules)**: Energy consumed for each LLM response in Joules.

Checking "Show more technical details" above the table will reveal more detailed columns.
Also, for more detailed information, please take a look at the **About** tab.
"""
        return detailed if detail_mode else brief
class VLMChatTableManager(LLMTableManager):
    """VLM table manager for chat tasks."""

    def get_tab_name(self) -> str:
        """Title of this leaderboard's tab."""
        return "VLM Visual Chat"

    def get_intro_text(self) -> str:
        """HTML introduction inserted above the table."""
        return """
<h2>How much energy do GenAI models consume?</h2>

<h3>VLM visual chatbot response generation</h3>

<p style="font-size: 16px">
Vision language models (VLMs) are large language models that can understand images along with text and generate human-like responses to chat prompts with images.
Using <a href="https://ml.energy/zeus">Zeus</a> for energy measurement, we created a leaderboard for VLM chat energy consumption.
</p>

<p style="font-size: 16px">
More models will be added over time. Stay tuned!
</p>
"""

    def get_detail_text(self, detail_mode: bool) -> str:
        """Markdown detail text inserted below the table.

        Returns the full technical column description when `detail_mode` is
        enabled, otherwise the simplified one.
        """
        detailed = """
**TPOT (Time Per Output Token)** is the time between each token generated by LLMs as part of their response.
An average TPOT of 0.20 seconds roughly corresponds to a person reading at 240 words per minute and assuming one word is 1.3 tokens on average.
You can tweak the TPOT slider to adjust the target average TPOT for the models.

Each row corresponds to one model, given a constraint on the maximum average TPOT.
If more than one GPU types were chosen, the row shows results from the GPU with the lowest energy consumption per request.

Columns
- **Model**: The name of the model.
- **Params (B)**: Number of parameters in the model.
- **GPU**: Name of the GPU model used for benchmarking.
- **TP**: Tensor parallelism degree.
- **PP**: Pipeline parallelism degree. (TP * PP is the total number of GPUs used.)
- **Energy/req (J)**: Energy consumed per request in Joules.
- **Avg TPOT (s)**: Average time per output token in seconds.
- **Token tput (toks/s)**: Average number of tokens generated by the engine per second.
- **Avg Output Tokens**: Average number of output tokens in the LLM's response.
- **Avg BS**: Average batch size of the serving engine over time.
- **Max BS**: Maximum batch size configuration of the serving engine.

For more detailed information, please take a look at the **About** tab.
"""
        brief = """
Columns
- **Model**: The name of the model.
- **Parameters (Billions)**: Number of parameters in the model. This is the size of the model.
- **GPU model**: Name of the GPU model used for benchmarking.
- **Energy per response (Joules)**: Energy consumed for each LLM response in Joules.

Checking "Show more technical details" above the table will reveal more detailed columns.
Also, for more detailed information, please take a look at the **About** tab.
"""
        return detailed if detail_mode else brief
class DiffusionTableManager(TableManager):
    def __init__(self, data_dir: str, task_name: str) -> None:
        """Load leaderboard data from files in `data_dir`.

        Under `data_dir`, there should be:
        - `models.json`: JSON file that maps huggingface model IDs to model info.
              Some models listed in this file may not have benchmark results.
        - `schema.yaml`: YAML file containing the schema of the benchmark.

        Then, benchmark data files are nested under `data_dir` according to the schema.
        One directory hierarchy for each choice in the schema and then two more -- the
        model's HuggingFace hub organization and the model name.

        Raises:
            ValueError: If `task_name` is not a known diffusion task, or no
                benchmark JSON files could be read.
        """
        super().__init__(data_dir)

        self.task_name = task_name

        # The energy column differs between video and image generation tasks.
        if "to video" in task_name.lower():
            self.energy_col = "Energy/video (J)"
            self.energy_col_readable = "Energy per video (Joules)"
        elif "to image" in task_name.lower():
            self.energy_col = "Energy/image (J)"
            self.energy_col_readable = "Energy per image (Joules)"
        else:
            raise ValueError(f"Unknown task name: {task_name=}")

        # Read in the data into a Pandas DataFrame.
        # Important: The ordering of `self.schema` determines the directory structure.
        # Use context managers so file handles are not leaked.
        with open(self.data_dir / "schema.yaml") as f:
            self.schema = yaml.safe_load(f)
        with open(self.data_dir / "models.json") as f:
            models: dict[str, dict[str, Any]] = json.load(f)

        # Accumulate per-file frames and concatenate once at the end;
        # repeated `pd.concat` inside the loop is quadratic.
        dfs: list[pd.DataFrame] = []
        for choice in itertools.product(*self.schema.values()):
            result_dir = self.data_dir / "/".join(choice)
            with contextlib.suppress(FileNotFoundError):
                for model_id, model_info in models.items():
                    for file in (result_dir / model_id).glob("*.json"):
                        with open(file) as f:
                            model_df = pd.DataFrame([json.load(f)])
                        # Sanity checks and standardization of schema values.
                        assert model_df["Model"].iloc[0] == model_id
                        for key, val in zip(self.schema.keys(), choice):
                            assert (
                                str(val).lower() in str(model_df[key].iloc[0]).lower()
                            )
                            model_df[key] = val
                        # Format the model name as an HTML anchor.
                        model_df["Model"] = self._wrap_model_name(model_info["url"], model_info["nickname"])
                        model_df["Total params"] = model_info["total_params"]
                        model_df["Denoising params"] = model_info["denoising_params"]
                        model_df["Resolution"] = model_info["resolution"]
                        dfs.append(model_df)

        if not dfs:
            raise ValueError(
                f"No benchmark JSON files were read from {self.data_dir=}."
            )
        res_df = pd.concat(dfs)

        # Order columns.
        columns = res_df.columns.to_list()
        cols_to_order = ["Model", "Denoising params", "Total params"]
        cols_to_order.extend(self.schema.keys())
        columns = cols_to_order + [col for col in columns if col not in cols_to_order]
        res_df = res_df[columns]

        # Order rows.
        res_df = res_df.sort_values(by=["Model", *self.schema.keys(), self.energy_col])

        self.full_df = res_df.round(2)

        # We need to set the default view separately when `gr.State` is forked.
        self.set_filter_get_df(detail_mode=False)

    def get_benchmark_checkboxes(self) -> dict[str, list[str]]:
        """Checkbox choices come directly from the benchmark schema."""
        return self.schema

    def get_all_models(self) -> list[str]:
        """Return all benchmarked model names (without the HTML anchor)."""
        return self.full_df["Model"].apply(self._unwrap_model_name).unique().tolist()

    def set_filter_get_df(self, detail_mode: bool, *filters) -> pd.DataFrame:
        """Set the current set of filters and return the filtered DataFrame.

        Filters can either be completely empty, or be a concatenated list of
        choices from all checkboxes and all sliders.
        """
        # If the filter is empty, we default to the first choice for each key.
        if not filters:
            checkboxes = [choices[:1] for choices in self.schema.values()]
            sliders = [slider[3] for slider in self.get_benchmark_sliders().values()]
            filters = checkboxes + sliders

        index = np.full(len(self.full_df), True)
        # Checkboxes
        for setup, choice in zip(self.schema, filters):
            index = index & self.full_df[setup].isin(choice)
        cur_df = self.full_df.loc[index]

        # Sliders (We just have Batch latency for now.)
        # For each `Model`, first filter out rows whose `Batch latency (s)` is
        # greater than the slider value, then keep only the row with the
        # smallest energy consumption.
        batch_latency = filters[-1]
        cur_df = (
            cur_df
            .groupby("Model")[cur_df.columns]
            .apply(
                lambda x: x[x["Batch latency (s)"] <= batch_latency],
                include_groups=True,
            )
            .sort_values(by=self.energy_col)
            .reset_index(drop=True)
            .groupby("Model")
            .head(1)
        )

        if not detail_mode:
            core_columns = ["Model", "Denoising params", "GPU", "Resolution", "Frames", self.energy_col]
            readable_name_mapping = {
                "Denoising params": "Denoising parameters (Billions)",
                "GPU": "GPU model",
                self.energy_col: self.energy_col_readable,
            }
            # Keep only core columns that actually exist (e.g., "Frames" is
            # absent for image tasks), preserving the current column order.
            cur_df = cur_df[[col for col in cur_df.columns if col in core_columns]]
            cur_df = cur_df.rename(columns=readable_name_mapping)

        return cur_df
class DiffusionT2ITableManager(DiffusionTableManager):
    """Diffusion table manager for text-to-image tasks."""

    def get_tab_name(self) -> str:
        """Title of this leaderboard's tab."""
        return "Diffusion Text to image"

    def get_intro_text(self) -> str:
        """HTML introduction inserted above the table."""
        text = """
<h2>How much energy do GenAI models consume?</h2>

<h3>Diffusion text-to-image generation</h3>

<p style="font-size: 16px">
Diffusion models generate images that align with input text prompts.
Using <a href="https://ml.energy/zeus">Zeus</a> for energy measurement, we created a leaderboard for the energy consumption of Diffusion text-to-image.
</p>

<p style="font-size: 16px">
More models will be added over time. Stay tuned!
</p>
"""
        return text

    def get_detail_text(self, detail_mode: bool) -> str:
        """Markdown detail text inserted below the table.

        Fixes user-facing typos: "denosing" -> "denoising" and the garbled
        "to generate gradually refine" phrasing.
        """
        if detail_mode:
            text = """
Each row corresponds to one model, given a constraint on the maximum computation time for the whole batch.
If more than one GPU types were chosen, the row shows results from the GPU with the lowest energy consumption per image.

Columns
- **Model**: The name of the model.
- **Denoising params**: Number of parameters in the denoising module (e.g., UNet, Transformer).
- **Total params**: Total number of parameters in the model, including encoders and decoders.
- **GPU**: Name of the GPU model used for benchmarking.
- **Energy/image (J)**: Energy consumed per generated image in Joules.
- **Batch latency (s)**: Time taken to generate a batch of images in seconds.
- **Batch size**: Number of prompts/images in a batch.
- **Denoising steps**: Number of denoising steps used for the diffusion model.
- **Resolution**: Resolution of the generated image.

For more detailed information, please take a look at the **About** tab.
"""
        else:
            text = """
Columns
- **Model**: The name of the model.
- **Denoising parameters (Billions)**: Number of parameters in the diffusion model's (core) denoising module. This part of the model is run repetitively to gradually refine the image.
- **GPU model**: Name of the GPU model used for benchmarking.
- **Energy per image (Joules)**: Energy consumed for each generated image in Joules.
- **Resolution**: Resolution of the generated image.

Checking "Show more technical details" above the table will reveal more detailed columns.
Also, for more detailed information, please take a look at the **About** tab.
"""
        return text

    def get_benchmark_sliders(self) -> dict[str, tuple[float, float, float, float]]:
        """Single slider: max batch latency (min, max, step, default)."""
        return {"Batch latency (s)": (0.0, 60.0, 1.0, 10.0)}
class DiffusionT2VTableManager(DiffusionTableManager):
    """Diffusion table manager for text-to-video tasks."""

    def get_tab_name(self) -> str:
        """Title of this leaderboard's tab."""
        return "Diffusion Text to video"

    def get_intro_text(self) -> str:
        """HTML introduction inserted above the table."""
        text = """
<h2>How much energy do GenAI models consume?</h2>

<h3>Diffusion text-to-video generation</h3>

<p style="font-size: 16px">
Diffusion models generate videos that align with input text prompts.
Using <a href="https://ml.energy/zeus">Zeus</a> for energy measurement, we created a leaderboard for the energy consumption of Diffusion text-to-video.
</p>

<p style="font-size: 16px">
More models will be added over time. Stay tuned!
</p>
"""
        return text

    def get_detail_text(self, detail_mode: bool) -> str:
        """Markdown detail text inserted below the table.

        Fixes a copy-paste bug ("Energy per video" described energy "for each
        generated image") and the "denosing" typo.
        """
        if detail_mode:
            text = """
Each row corresponds to one model, given a constraint on the maximum computation time for the whole batch.
If more than one GPU types were chosen, the row shows results from the GPU with the lowest energy consumption per video.

Columns
- **Model**: The name of the model.
- **Denoising params**: Number of parameters in the denoising module (e.g., UNet, Transformer).
- **Total params**: Total number of parameters in the model, including encoders and decoders.
- **GPU**: Name of the GPU model used for benchmarking.
- **Energy/video (J)**: Energy consumed per generated video in Joules.
- **Batch latency (s)**: Time taken to generate a batch of videos in seconds.
- **Batch size**: Number of prompts/videos in a batch.
- **Denoising steps**: Number of denoising steps used for the diffusion model.
- **Frames**: Number of frames in the generated video.
- **Resolution**: Resolution of the generated video.

For more detailed information, please take a look at the **About** tab.
"""
        else:
            text = """
Columns
- **Model**: The name of the model.
- **Denoising parameters (Billions)**: Number of parameters in the diffusion model's (core) denoising module. This part of the model is run repetitively to gradually refine the video.
- **GPU model**: Name of the GPU model used for benchmarking.
- **Energy per video (Joules)**: Energy consumed for each generated video in Joules.
- **Frames**: Number of frames in the generated video.
- **Resolution**: Resolution of the generated video.

Checking "Show more technical details" above the table will reveal more detailed columns.
Also, for more detailed information, please take a look at the **About** tab.
"""
        return text

    def get_benchmark_sliders(self) -> dict[str, tuple[float, float, float, float]]:
        """Single slider: max batch latency (min, max, step, default)."""
        return {"Batch latency (s)": (0.0, 60.0, 1.0, 10.0)}
class DiffusionI2VTableManager(DiffusionTableManager):
    """Diffusion table manager for image-to-video tasks."""

    def get_tab_name(self) -> str:
        """Title of this leaderboard's tab."""
        return "Diffusion Image to video"

    def get_intro_text(self) -> str:
        """HTML introduction inserted above the table."""
        text = """
<h2>How much energy do GenAI models consume?</h2>

<h3>Diffusion image-to-video generation</h3>

<p style="font-size: 16px">
Diffusion models generate videos given an input image (and sometimes alongside with text).
Using <a href="https://ml.energy/zeus">Zeus</a> for energy measurement, we created a leaderboard for the energy consumption of Diffusion image-to-video.
</p>

<p style="font-size: 16px">
More models will be added over time. Stay tuned!
</p>
"""
        return text

    def get_detail_text(self, detail_mode: bool) -> str:
        """Markdown detail text inserted below the table.

        Fixes a copy-paste bug ("Energy per video" described energy "for each
        generated image") and the "denosing" typo.
        """
        if detail_mode:
            text = """
Each row corresponds to one model, given a constraint on the maximum computation time for the whole batch.
If more than one GPU types were chosen, the row shows results from the GPU with the lowest energy consumption per video.

Columns
- **Model**: The name of the model.
- **Denoising params**: Number of parameters in the denoising module (e.g., UNet, Transformer).
- **Total params**: Total number of parameters in the model, including encoders and decoders.
- **GPU**: Name of the GPU model used for benchmarking.
- **Energy/video (J)**: Energy consumed per generated video in Joules.
- **Batch latency (s)**: Time taken to generate a batch of videos in seconds.
- **Batch size**: Number of prompts/videos in a batch.
- **Denoising steps**: Number of denoising steps used for the diffusion model.
- **Frames**: Number of frames in the generated video.
- **Resolution**: Resolution of the generated video.

For more detailed information, please take a look at the **About** tab.
"""
        else:
            text = """
Columns
- **Model**: The name of the model.
- **Denoising parameters (Billions)**: Number of parameters in the diffusion model's (core) denoising module. This part of the model is run repetitively to gradually refine the video.
- **GPU model**: Name of the GPU model used for benchmarking.
- **Energy per video (Joules)**: Energy consumed for each generated video in Joules.
- **Frames**: Number of frames in the generated video.
- **Resolution**: Resolution of the generated video.

Checking "Show more technical details" above the table will reveal more detailed columns.
Also, for more detailed information, please take a look at the **About** tab.
"""
        return text

    def get_benchmark_sliders(self) -> dict[str, tuple[float, float, float, float]]:
        """Single slider: max batch latency (min, max, step, default)."""
        return {"Batch latency (s)": (0.0, 120.0, 1.0, 60.0)}
class LegacyTableManager: | |
def __init__(self, data_dir: str) -> None: | |
"""Load the legacy LLM leaderboard data from CSV files in data_dir. | |
Inside `data_dir`, there should be: | |
- `models.json`: a JSON file containing information about each model. | |
- `schema.yaml`: a YAML file containing the schema of the benchmark. | |
- `score.csv`: a CSV file containing the NLP evaluation metrics of each model. | |
- `*_benchmark.csv`: CSV files containing the system benchmark results. | |
Especially, the `*_benchmark.csv` files should be named after the | |
parameters used in the benchmark. For example, for the CSV file that | |
contains benchmarking results for A100 and the chat-concise task | |
(see `schema.yaml`) for possible choices, the file should be named | |
`A100_chat-concise_benchmark.csv`. | |
""" | |
# Load and merge CSV files. | |
df = self._read_tables(data_dir) | |
# Add the #params column. | |
models = json.load(open(f"{data_dir}/models.json")) | |
df["parameters"] = df["model"].apply(lambda x: models[x]["params"]) | |
# Make the first column (model) an HTML anchor to the model's website. | |
def format_model_link(model_name: str) -> str: | |
url = models[model_name]["url"] | |
nickname = models[model_name]["nickname"] | |
return ( | |
f'<a style="text-decoration: underline; text-decoration-style: dotted" ' | |
f'target="_blank" href="{url}">{nickname}</a>' | |
) | |
df["model"] = df["model"].apply(format_model_link) | |
# Sort by our 'energy efficiency' score. | |
df = df.sort_values(by="energy", ascending=True) | |
# The full table where all the data are. | |
self.full_df = df | |
# Default view of the table is to only show the first options. | |
self.set_filter_get_df() | |
def _read_tables(self, data_dir: str) -> pd.DataFrame: | |
"""Read tables.""" | |
df_score = pd.read_csv(f"{data_dir}/score.csv") | |
with open(f"{data_dir}/schema.yaml") as file: | |
self.schema: dict[str, list] = yaml.safe_load(file) | |
res_df = pd.DataFrame() | |
# Do a cartesian product of all the choices in the schema | |
# and try to read the corresponding CSV files. | |
for choice in itertools.product(*self.schema.values()): | |
filepath = f"{data_dir}/{'_'.join(choice)}_benchmark.csv" | |
with contextlib.suppress(FileNotFoundError): | |
df = pd.read_csv(filepath) | |
for key, val in zip(self.schema.keys(), choice): | |
df.insert(1, key, val) | |
res_df = pd.concat([res_df, df]) | |
if res_df.empty: | |
raise ValueError(f"No benchmark CSV files were read from {data_dir=}.") | |
df = pd.merge(res_df, df_score, on=["model"]).round(2) | |
# Order columns. | |
columns = df.columns.to_list() | |
cols_to_order = ["model"] | |
cols_to_order.extend(self.schema.keys()) | |
cols_to_order.append("energy") | |
columns = cols_to_order + [col for col in columns if col not in cols_to_order] | |
df = df[columns] | |
# Delete rows with *any* NaN values. | |
df = df.dropna() | |
return df | |
def _format_msg(self, text: str) -> str:
    """Wrap `text` in a <pre> tag so the browser renders it in monospace."""
    return "<pre style='font-family: monospace'>" + text + "</pre>"
def get_dropdown(self):
    """Build the X/Y/Z axis dropdowns for the plot from the table columns."""
    # Skip the first column (the model name); it is not a plottable axis.
    axis_choices = self.full_df.columns.tolist()[1:]
    dropdowns = [
        gr.Dropdown(choices=axis_choices, value="parameters", label="X"),
        gr.Dropdown(choices=axis_choices, value="energy", label="Y"),
        gr.Dropdown(choices=["None", *axis_choices], label="Z (optional)"),
    ]
    return dropdowns
def update_dropdown(self):
    """Return dropdown updates that refresh the axis choices after a table change."""
    # Skip the first column (the model name); it is not a plottable axis.
    axis_choices = self.full_df.columns.tolist()[1:]
    return [
        gr.Dropdown.update(choices=axis_choices),
        gr.Dropdown.update(choices=axis_choices),
        gr.Dropdown.update(choices=["None", *axis_choices]),
    ]
def set_filter_get_df(self, *filters) -> pd.DataFrame:
    """Remember the given filters and return the matching rows of the full table.

    Each positional filter is a list of allowed values for the
    corresponding schema key (in schema order). With no filters given,
    default to the first choice of every schema key.
    """
    if not filters:
        filters = [choices[:1] for choices in self.schema.values()]

    # AND together one membership mask per schema column.
    mask = np.full(len(self.full_df), True)
    for column, allowed in zip(self.schema, filters):
        mask = mask & self.full_df[column].isin(allowed)

    self.cur_df = self.full_df.loc[mask]
    self.cur_index = mask
    return self.cur_df
def get_intro_text(self) -> str:
    """Return the legacy leaderboard's introduction text as an HTML fragment."""
    # NOTE: This is a raw HTML fragment rendered by the UI; keep it verbatim.
    return """
<div align="center">
<h2 style="color: #23d175">This is the legacy ML.ENERGY LLM leaderboard. This will be removed at the end of this year.</h2>
</div>
<h3>How much energy do modern Large Language Models (LLMs) consume for inference?</h3>
<p style="font-size: 16px">
We used <a href="https://ml.energy/zeus">Zeus</a> to benchmark various open source LLMs in terms of how much time and energy they consume for inference.
</p>
<p style="font-size: 16px">
For more detailed information, please take a look at the <b>About</b> tab.
Every benchmark is limited in some sense -- Before you interpret the results, please take a look at the <b>Limitations</b> section there, too.
</p>
"""
# The global instance of the TableManager should only be used when
# initializing components in the Gradio interface. If the global instance
# is mutated while handling user sessions, the change will be reflected
# in every user session. Instead, the instance provided by gr.State should
# be used.
global_ltbm = LegacyTableManager("data/legacy")
# One table manager per leaderboard tab; list order determines tab order.
global_tbms = [
    LLMChatTableManager("data/llm_text_generation/chat", "Chat"),
    LLMCodeTableManager("data/llm_text_generation/code", "Code"),
    VLMChatTableManager("data/mllm_text_generation/chat", "Visual chat"),
    DiffusionT2ITableManager("data/diffusion/text-to-image", "Text to image"),
    DiffusionT2VTableManager("data/diffusion/text-to-video", "Text to video"),
    DiffusionI2VTableManager("data/diffusion/image-to-video", "Image to video"),
]
# Custom JS, run on every Dataframe change event.
# XXX: This is a hack to make the model names clickable.
# Ideally, we should set `datatype` in the constructor of `gr.DataFrame` to
# `["markdown"] + ["number"] * (len(df.columns) - 1)` and format models names
# as an HTML <a> tag. However, because we also want to dynamically add new
# columns to the table and Gradio < 4.0 does not support updating `datatype` with
# `gr.DataFrame.update` yet, we need to manually walk into the DOM and replace
# the innerHTML of the model name cells with dynamically interpreted HTML.
# Desired feature tracked at https://github.com/gradio-app/gradio/issues/3732
# NOTE: The loop bound below is baked in at module import time from the row
# counts of all tables; it is an upper bound on visible rows.
dataframe_update_js = f"""
function format_model_link() {{
  // Iterate over the cells of the first column of the leaderboard table.
  var table_element = document.querySelectorAll(".tab-leaderboard");
  for (var table of table_element) {{
    for (let index = 1; index <= {len(global_ltbm.full_df) + sum(len(tbm.full_df) for tbm in global_tbms)}; index++) {{
      // Get the cell from `table`.
      var cell = table.querySelector(`div > div > div > table > tbody > tr:nth-child(${{index}}) > td:nth-child(1) > div > span`);
      // var cell = document.querySelector(
      //   `.tab-leaderboard > div > div > div > table > tbody > tr:nth-child(${{index}}) > td:nth-child(1) > div > span`
      // );
      // If nothing was found, it likely means that now the visible table has less rows
      // than the full table. This happens when the user filters the table. In this case,
      // we should just return.
      if (cell == null) break;
      // This check exists to make this function idempotent.
      // Multiple changes to the Dataframe component may invoke this function,
      // multiple times to the same HTML table (e.g., adding and sorting cols).
      // Thus, we check whether we already formatted the model names by seeing
      // whether the child of the cell is a text node. If it is not,
      // it means we already parsed it into HTML, so we should just return.
      if (cell.firstChild.nodeType != 3) break;
      // Decode and interpret the innerHTML of the cell as HTML.
      var decoded_string = new DOMParser().parseFromString(cell.innerHTML, "text/html").documentElement.textContent;
      var temp = document.createElement("template");
      temp.innerHTML = decoded_string;
      var model_anchor = temp.content.firstChild;
      // Replace the innerHTML of the cell with the interpreted HTML.
      cell.replaceChildren(model_anchor);
    }}
  }}
  // Return all arguments as is.
  return arguments
}}
"""
# Custom CSS injected into the Gradio page (branding colors, table layout,
# the blinking model-name reveal animation, and greyed-out Colosseum state).
custom_css = """
/* Make ML.ENERGY look like a clickable logo. */
.text-logo {
    color: #23d175 !important;
    text-decoration: none !important;
}

/* Make the submit button the same color as the logo. */
.btn-submit {
    background: #23d175 !important;
    color: white !important;
    border: 0 !important;
}

/* Center the plotly plot inside its container. */
.plotly > div {
    margin: auto !important;
}

/* Limit the width of the first column to 300 px. */
table td:first-child,
table th:first-child {
    max-width: 300px;
    overflow: auto;
    white-space: nowrap;
}

/* Make tab buttons larger */
.tab-nav > button {
    font-size: 18px !important;
}

/* Color texts. */
.green-text {
    color: #23d175 !important;
}
.red-text {
    color: #ff3860 !important;
}

/* Flashing model name borders. */
@keyframes blink {
    0%, 33%, 67%, 100% {
        border-color: transparent;
    }
    17%, 50%, 83% {
        border-color: #23d175;
    }
}
/* Older browser compatibility */
@-webkit-keyframes blink {
    0%, 33%, 67%, 100% {
        border-color: transparent;
    }
    17%, 50%, 83% {
        border-color: #23d175;
    }
}
.model-name-text {
    border: 2px solid transparent; /* Transparent border initially */
    animation: blink 3s ease-in-out 1; /* One complete cycle of animation, lasting 3 seconds */
    -webkit-animation: blink 3s ease-in-out 1; /* Older browser compatibility */
}

/* Grey out components when the Colosseum is down. */
.greyed-out {
    pointer-events: none;
    opacity: 0.4;
}

/* Make the Citation header larger */
#citation-header > div > span {
    font-size: 16px !important;
}

/* Align everything in tables to the right. */
/* Not the best solution, but at least makes the numbers align. */
.tab-leaderboard span {
    text-align: right;
}
"""
# Configure the Colosseum controller address. If COLOSSEUM_CONTROLLER_ADDR
# is not set, the app still starts, but in local testing mode: the Colosseum
# is marked down and its UI components are greyed out.
controller_addr = os.environ.get("COLOSSEUM_CONTROLLER_ADDR")
if controller_addr is None:
    COLOSSEUM_UP = False
    COLOSSEUM_DOWN_MESSAGE = "<br/><h2 style='text-align: center'>Local testing mode. Colosseum disabled.</h2>"
    controller_addr = "localhost"
global_controller_client = ControllerClient(controller_addr=controller_addr, timeout=15)
# Fetch the latest update date of the leaderboard repository.
# Use a timeout and catch connection-level errors so that a slow or
# unreachable GitHub API cannot hang or crash app startup; the original
# code only handled non-200 responses, not raised exceptions.
try:
    resp = requests.get(
        "https://api.github.com/repos/ml-energy/leaderboard/commits/master",
        timeout=10,
    )
except requests.RequestException as err:
    current_date = "[Failed to fetch]"
    print("Failed to fetch the latest release date of the leaderboard repository.")
    print(err)
else:
    if resp.status_code != 200:
        current_date = "[Failed to fetch]"
        print("Failed to fetch the latest release date of the leaderboard repository.")
        print(resp.json())
    else:
        # Commit timestamps are ISO-8601 in UTC; display in US Eastern.
        current_datetime = parser.parse(resp.json()["commit"]["author"]["date"])
        current_date = current_datetime.astimezone(tz.gettz("US/Eastern")).strftime(
            "%Y-%m-%d"
        )
# Load the list of models. To reload, the app should be restarted.
RANDOM_MODEL_NAME = "Random"
RANDOM_USER_PREFERENCE = "Two random models"
# Skip the controller round trip entirely when the Colosseum is down.
global_available_models = global_controller_client.get_available_models() if COLOSSEUM_UP else []
# Bidirectional mapping between a model name and the preference text
# shown to the user (e.g. "One is <model>").
model_name_to_user_pref = {model: f"One is {model}" for model in global_available_models}
model_name_to_user_pref[RANDOM_MODEL_NAME] = RANDOM_USER_PREFERENCE
user_pref_to_model_name = {v: k for k, v in model_name_to_user_pref.items()}
# Colosseum helper functions.
def enable_interact(num: int):
    """Return a no-argument handler that marks `num` components as interactive."""
    def inner():
        updates = [gr.update(interactive=True)] * num
        return updates
    return inner
def disable_interact(num: int):
    """Return a no-argument handler that marks `num` components as non-interactive."""
    def inner():
        updates = [gr.update(interactive=False)] * num
        return updates
    return inner
def consumed_less_energy_message(energy_a, energy_b):
    """Message shown when the user preferred the model that consumed less energy.

    Reports the saving as a percentage, switching to a multiplicative
    factor ("N.Nx") when one model used at least twice the energy.
    """
    lo, hi = sorted((energy_a, energy_b))
    ratio = lo / hi
    if ratio <= 0.5:
        how_much = f"{1 / ratio:.1f}x"
    else:
        how_much = f"{100 - ratio * 100:.1f}%"
    return f"<h2>That response also <span class='green-text'>consumed {how_much} less energy</span> ({energy_a:,.0f} J vs. {energy_b:,.0f} J)!</h2>"
def consumed_more_energy_message(energy_a, energy_b):
    """Message shown when the user preferred the model that consumed more energy.

    Reports the overhead as a percentage, switching to a multiplicative
    factor ("N.Nx") when one model used at least twice the energy.
    """
    lo, hi = sorted((energy_a, energy_b))
    ratio = hi / lo
    if ratio >= 2.0:
        how_much = f"{ratio:.1f}x"
    else:
        how_much = f"{ratio * 100 - 100:.1f}%"
    return f"<h2>That response <span class='red-text'>consumed {how_much} more energy</span> ({energy_a:,.0f} J vs. {energy_b:,.0f} J).</h2>"
# Colosseum event handlers
def on_load():
    """Initialize all leaderboard dataframes on page load.

    Returns the default-filtered legacy table followed by one
    default-filtered table per leaderboard tab, matching the order of
    `block.load`'s `outputs`.
    """
    dataframe = global_ltbm.set_filter_get_df()
    dataframes = [global_tbm.set_filter_get_df(detail_mode=False) for global_tbm in global_tbms]
    return dataframe, *dataframes
def add_prompt_disable_submit(prompt, history_a, history_b):
    """Append the user's prompt to both chat histories and lock the input widgets.

    Forks a fresh per-session controller client so concurrent sessions
    do not share state.
    """
    client = global_controller_client.fork()
    # The textbox value is a single space (not "") — presumably so the box
    # renders as visually empty while disabled; verify before changing.
    locked_textbox = gr.Textbox.update(value=" ", interactive=False)
    locked_button = gr.Button.update(interactive=False)
    return [
        locked_textbox,
        locked_button,
        [*history_a, [prompt, ""]],
        [*history_b, [prompt, ""]],
        client,
    ]
def generate_responses(client: ControllerClient, history_a, history_b):
    """Stream responses from the two models, yielding after every chunk pair.

    The two model streams are consumed in lockstep; when one stream
    finishes first, its side simply stops growing.
    """
    preference = RANDOM_MODEL_NAME
    stream_a = client.prompt(
        prompt=history_a[-1][0], index=0, model_preference=preference
    )
    stream_b = client.prompt(
        prompt=history_b[-1][0], index=1, model_preference=preference
    )
    for chunk_a, chunk_b in itertools.zip_longest(stream_a, stream_b):
        if chunk_a is not None:
            history_a[-1][1] += chunk_a
        if chunk_b is not None:
            history_b[-1][1] += chunk_b
        yield [history_a, history_b]
def make_resp_vote_func(victory_index: Literal[0, 1]):
    """Return a function that will be called when the user clicks on response preference vote buttons.

    `victory_index` is 0 for the left (Model A) button and 1 for the
    right (Model B) button. The returned handler's output list must stay
    in sync, position by position, with the `outputs=` list wired to the
    vote buttons below.
    """
    def resp_vote_func(client: ControllerClient):
        vote_response = client.response_vote(victory_index=victory_index)
        model_name_a, model_name_b = map(lambda n: f"## {n}", vote_response.model_names)
        energy_a, energy_b = vote_response.energy_consumptions
        # User liked the model that also consumed less energy:
        # reveal everything and finish the round immediately.
        if (victory_index == 0 and energy_a <= energy_b) or (victory_index == 1 and energy_a >= energy_b):
            energy_message = consumed_less_energy_message(energy_a, energy_b)
            return [
                # Disable response vote buttons
                gr.Button.update(interactive=False), gr.Button.update(interactive=False),
                # Reveal model names
                gr.Markdown.update(model_name_a, visible=True), gr.Markdown.update(model_name_b, visible=True),
                # Display energy consumption comparison message
                gr.Markdown.update(energy_message, visible=True),
                # Keep energy vote buttons hidden
                gr.Button.update(visible=False, interactive=False), gr.Button.update(visible=False, interactive=False),
                # Enable reset button
                gr.Button.update(visible=True, interactive=True),
            ]
        # User liked the model that consumed more energy:
        # keep names hidden and ask whether the extra energy was worth it.
        else:
            energy_message = consumed_more_energy_message(energy_a, energy_b)
            return [
                # Disable response vote buttons
                gr.Button.update(interactive=False), gr.Button.update(interactive=False),
                # Leave model names hidden
                gr.Markdown.update(visible=False), gr.Markdown.update(visible=False),
                # Display energy consumption comparison message
                gr.Markdown.update(energy_message, visible=True),
                # Reveal and enable energy vote buttons
                gr.Button.update(visible=True, interactive=True), gr.Button.update(visible=True, interactive=True),
                # Keep the reset button disabled
                gr.Button.update(visible=False, interactive=False),
            ]
    return resp_vote_func
def make_energy_vote_func(is_worth: bool):
    """Return a function that will be called when the user clicks on energy vote buttons.

    `is_worth` is True for the "worth the extra energy" button and False
    for the other. The returned handler's output list must stay in sync,
    position by position, with the `outputs=` list wired below.
    """
    def energy_vote_func(client: ControllerClient, energy_message: str):
        vote_response = client.energy_vote(is_worth=is_worth)
        model_name_a, model_name_b = map(lambda n: f"## {n}", vote_response.model_names)
        return [
            # Reveal model names
            gr.Markdown.update(model_name_a, visible=True), gr.Markdown.update(model_name_b, visible=True),
            # Disable energy vote buttons
            gr.Button.update(interactive=False), gr.Button.update(interactive=False),
            # Enable reset button
            gr.Button.update(interactive=True, visible=True),
            # Append the verdict to the energy comparison message.
            # `energy_message` ends with "</h2>"; slice off those 5 chars and
            # re-append the closing tag with the verdict text inserted.
            energy_message[:-5] + (" Fair enough.</h2>" if is_worth else " Wasn't worth it.</h2>"),
        ]
    return energy_vote_func
def play_again():
    """Reset the Colosseum UI so the user can start a new match.

    The output list must stay in sync, position by position, with the
    `outputs=` list wired to `play_again_btn` below.
    """
    # NOTE: This previously also built a shuffled copy of
    # `global_available_models` (for a model-preference dropdown), but the
    # result was never returned or used — the dead code was removed.
    return [
        # Clear chatbot history
        None, None,
        # Enable prompt textbox and submit button
        gr.Textbox.update(value="", interactive=True), gr.Button.update(interactive=True),
        # Mask model names
        gr.Markdown.update(value="", visible=False), gr.Markdown.update(value="", visible=False),
        # Hide energy vote buttons and message
        gr.Button.update(visible=False), gr.Button.update(visible=False), gr.Markdown.update(visible=False),
        # Disable reset button
        gr.Button.update(interactive=False, visible=False),
    ]
def toggle_detail_mode_slider_visibility(detail_mode: bool, *sliders):
    """Propagate the detail-mode flag and show/hide every slider accordingly."""
    slider_updates = [gr.update(visible=detail_mode) for _ in sliders]
    return [detail_mode, *slider_updates]
def toggle_detail_mode_sync_tabs(detail_mode: bool, *checkboxes):
    """Sync every tab's detail checkbox and refresh each tab's detail text."""
    checkbox_updates = [gr.Checkbox.update(value=detail_mode) for _ in checkboxes]
    text_updates = [gr.Markdown.update(tbm.get_detail_text(detail_mode)) for tbm in global_tbms]
    return checkbox_updates + text_updates
# JS run after "Play again!": puts the cursor back in the prompt textbox
# (the one marked with the `autofocus` attribute).
focus_prompt_input_js = """
function() {
    for (let textarea of document.getElementsByTagName("textarea")) {
        if (textarea.hasAttribute("autofocus")) {
            textarea.focus();
            return;
        }
    }
}
"""
with gr.Blocks(css=custom_css) as block:
    # Per-session copies of the table managers; mutating these does not
    # leak across user sessions (unlike the module-level globals).
    tbm = gr.State(global_ltbm)  # type: ignore
    local_tbms: list[TableManager] = [gr.State(global_tbm) for global_tbm in global_tbms]  # type: ignore
    detail_mode = gr.State(False)  # type: ignore

    with gr.Box():
        gr.HTML(
            "<h1><a href='https://ml.energy' class='text-logo'>ML.ENERGY</a> Leaderboard</h1>"
        )

    with gr.Tabs():
        # Tab: Colosseum.
        with gr.Tab("Colosseum ⚔️️"):
            if COLOSSEUM_UP:
                gr.Markdown(open("docs/colosseum_top.md").read())
            else:
                # Colosseum is down: show the notice and a demo video instead.
                gr.HTML(COLOSSEUM_DOWN_MESSAGE)
                gr.HTML("<h3 style='text-align: center'>The energy leaderboard is still available.</h3><br/>")
                gr.HTML(COLOSSUMM_YOUTUBE_DEMO_EMBED_HTML)

            # Prompt input row (greyed out when the Colosseum is down).
            with gr.Group():
                with gr.Row():
                    prompt_input = gr.Textbox(
                        show_label=False,
                        placeholder="Input your prompt, e.g., 'Explain machine learning in simple terms.'",
                        container=False,
                        scale=20,
                        interactive=COLOSSEUM_UP,
                        elem_classes=None if COLOSSEUM_UP else ["greyed-out"],
                    )
                    prompt_submit_btn = gr.Button(
                        value="⚔️️ Fight!",
                        elem_classes=["btn-submit"] if COLOSSEUM_UP else ["greyed-out"],
                        min_width=60,
                        scale=1,
                        interactive=COLOSSEUM_UP,
                    )

            # Two side-by-side anonymous model columns: hidden name,
            # chatbot, and a "this one is better" vote button each.
            with gr.Row():
                masked_model_names = []
                chatbots = []
                resp_vote_btn_list: list[gr.component.Component] = []
                with gr.Column():
                    with gr.Row():
                        masked_model_names.append(
                            gr.Markdown(visible=False, elem_classes=["model-name-text"])
                        )
                    with gr.Row():
                        chatbots.append(
                            gr.Chatbot(
                                label="Model A",
                                elem_id="chatbot",
                                height=400,
                                elem_classes=None if COLOSSEUM_UP else ["greyed-out"],
                            )
                        )
                    with gr.Row():
                        left_resp_vote_btn = gr.Button(
                            value="👈 Model A is better", interactive=False
                        )
                        resp_vote_btn_list.append(left_resp_vote_btn)

                with gr.Column():
                    with gr.Row():
                        masked_model_names.append(
                            gr.Markdown(visible=False, elem_classes=["model-name-text"])
                        )
                    with gr.Row():
                        chatbots.append(
                            gr.Chatbot(
                                label="Model B",
                                elem_id="chatbot",
                                height=400,
                                elem_classes=None if COLOSSEUM_UP else ["greyed-out"],
                            )
                        )
                    with gr.Row():
                        right_resp_vote_btn = gr.Button(
                            value="👉 Model B is better", interactive=False
                        )
                        resp_vote_btn_list.append(right_resp_vote_btn)

            with gr.Row():
                energy_comparison_message = gr.HTML(visible=False)

            # Energy vote buttons, revealed only when the preferred model
            # consumed more energy (see make_resp_vote_func).
            with gr.Row():
                worth_energy_vote_btn = gr.Button(
                    value="The better response was worth 👍 the extra energy.",
                    visible=False,
                )
                notworth_energy_vote_btn = gr.Button(
                    value="Not really worth that much more. 👎", visible=False
                )
                energy_vote_btn_list: list[gr.component.Component] = [
                    worth_energy_vote_btn,
                    notworth_energy_vote_btn,
                ]

            with gr.Row():
                play_again_btn = gr.Button(
                    "Play again!", visible=False, elem_classes=["btn-submit"]
                )

            gr.Markdown(open("docs/colosseum_bottom.md").read())

            # Per-session forked ControllerClient, set by add_prompt_disable_submit.
            controller_client = gr.State()

            # Wire both Enter-in-textbox and the Fight! button to the same
            # pipeline: add prompt -> stream responses -> enable vote buttons.
            (prompt_input
                .submit(add_prompt_disable_submit, [prompt_input, *chatbots], [prompt_input, prompt_submit_btn, *chatbots, controller_client], queue=False)
                .then(generate_responses, [controller_client, *chatbots], [*chatbots], queue=True, show_progress="hidden")
                .then(enable_interact(2), None, resp_vote_btn_list, queue=False))
            (prompt_submit_btn
                .click(add_prompt_disable_submit, [prompt_input, *chatbots], [prompt_input, prompt_submit_btn, *chatbots, controller_client], queue=False)
                .then(generate_responses, [controller_client, *chatbots], [*chatbots], queue=True, show_progress="hidden")
                .then(enable_interact(2), None, resp_vote_btn_list, queue=False))

            # NOTE: The `outputs` lists below must match, position by position,
            # the lists returned by the handlers they are wired to.
            left_resp_vote_btn.click(
                make_resp_vote_func(victory_index=0),
                [controller_client],
                [*resp_vote_btn_list, *masked_model_names, energy_comparison_message, *energy_vote_btn_list, play_again_btn],
                queue=False,
            )
            right_resp_vote_btn.click(
                make_resp_vote_func(victory_index=1),
                [controller_client],
                [*resp_vote_btn_list, *masked_model_names, energy_comparison_message, *energy_vote_btn_list, play_again_btn],
                queue=False,
            )
            worth_energy_vote_btn.click(
                make_energy_vote_func(is_worth=True),
                [controller_client, energy_comparison_message],
                [*masked_model_names, *energy_vote_btn_list, play_again_btn, energy_comparison_message],
                queue=False,
            )
            notworth_energy_vote_btn.click(
                make_energy_vote_func(is_worth=False),
                [controller_client, energy_comparison_message],
                [*masked_model_names, *energy_vote_btn_list, play_again_btn, energy_comparison_message],
                queue=False,
            )
            (play_again_btn
                .click(
                    play_again,
                    None,
                    [*chatbots, prompt_input, prompt_submit_btn, *masked_model_names, *energy_vote_btn_list, energy_comparison_message, play_again_btn],
                    queue=False,
                )
                .then(None, _js=focus_prompt_input_js, queue=False))

        # Tab: Leaderboards.
        # Accumulators spanning all leaderboard tabs, used for the
        # cross-tab detail-mode wiring and the page-load refresh below.
        dataframes = []
        all_detail_mode_checkboxes = []
        all_sliders = []
        all_detail_text_components = []
        for global_tbm, local_tbm in zip(global_tbms, local_tbms):
            with gr.Tab(global_tbm.get_tab_name()):
                # Box: Introduction text.
                with gr.Box():
                    gr.Markdown(global_tbm.get_intro_text())
                # Block: Checkboxes and sliders to select benchmarking parameters. A detail mode checkbox.
                with gr.Row():
                    checkboxes: list[gr.CheckboxGroup] = []
                    for key, choices in global_tbm.get_benchmark_checkboxes().items():
                        # Check the first element by default.
                        checkboxes.append(gr.CheckboxGroup(choices=choices, value=choices[:1], label=key))
                    sliders: list[gr.Slider] = []
                    for key, (min_val, max_val, step, default) in global_tbm.get_benchmark_sliders().items():
                        # Sliders start hidden unless detail mode is on.
                        sliders.append(gr.Slider(minimum=min_val, maximum=max_val, value=default, step=step, label=key, visible=detail_mode.value))
                    all_sliders.extend(sliders)
                with gr.Row():
                    detail_mode_checkbox = gr.Checkbox(label="Show more technical details", value=False)
                    all_detail_mode_checkboxes.append(detail_mode_checkbox)
                # Block: Leaderboard table.
                with gr.Row():
                    dataframe = gr.Dataframe(
                        type="pandas",
                        elem_classes=["tab-leaderboard"],
                        interactive=False,
                        max_rows=1000,
                    )
                    dataframes.append(dataframe)
                    # Make sure the models have clickable links.
                    dataframe.change(
                        None, None, None, _js=dataframe_update_js, queue=False
                    )
                    # Table automatically updates when users check or uncheck any checkbox or move any slider.
                    for element in [detail_mode_checkbox, *checkboxes, *sliders]:
                        element.change(
                            global_tbm.__class__.set_filter_get_df,
                            inputs=[local_tbm, detail_mode, *checkboxes, *sliders],
                            outputs=dataframe,
                            queue=False,
                        )
                # Block: More details about the leaderboard.
                with gr.Box():
                    detail_text = global_tbm.get_detail_text(detail_mode=False)
                    all_detail_text_components.append(gr.Markdown(detail_text))
                # Block: Leaderboard date.
                with gr.Row():
                    gr.HTML(
                        f"<h3 style='color: gray'>Last updated: {current_date}</h3>"
                    )

        # Tab: Legacy leaderboard.
        with gr.Tab("LLM Leaderboard (legacy)"):
            with gr.Box():
                gr.Markdown(global_ltbm.get_intro_text())
            # Block: Checkboxes to select benchmarking parameters.
            with gr.Row():
                with gr.Box():
                    gr.Markdown("### Benchmark results to show")
                    checkboxes: list[gr.CheckboxGroup] = []
                    for key, choices in global_ltbm.schema.items():
                        # Check the first choice of each key by default.
                        checkboxes.append(
                            gr.CheckboxGroup(
                                choices=choices, value=choices[:1], label=key
                            )
                        )
            # Block: Leaderboard table.
            with gr.Row():
                dataframe = gr.Dataframe(
                    type="pandas", elem_classes=["tab-leaderboard"], interactive=False
                )
            # Make sure the models have clickable links.
            dataframe.change(None, None, None, _js=dataframe_update_js, queue=False)
            # Table automatically updates when users check or uncheck any checkbox.
            for checkbox in checkboxes:
                checkbox.change(
                    LegacyTableManager.set_filter_get_df,
                    inputs=[tbm, *checkboxes],
                    outputs=dataframe,
                    queue=False,
                )
            # Block: Leaderboard date.
            with gr.Row():
                gr.HTML(f"<h3 style='color: gray'>Last updated: {current_date}</h3>")

        # Tab: About page.
        with gr.Tab("About"):
            gr.Markdown(open("docs/about.md").read())

    # Detail mode toggling: changing the checkbox in one tab syncs every
    # other tab's checkbox, slider visibility, and detail text.
    for detail_mode_checkbox in all_detail_mode_checkboxes:
        detail_mode_checkbox.change(
            toggle_detail_mode_slider_visibility,
            inputs=[detail_mode_checkbox, *all_sliders],
            outputs=[detail_mode, *all_sliders],
            queue=False,
        )
        detail_mode_checkbox.change(
            toggle_detail_mode_sync_tabs,
            inputs=[detail_mode_checkbox, *all_detail_mode_checkboxes],
            outputs=[*all_detail_mode_checkboxes, *all_detail_text_components],
            queue=False,
        )

    # Citation
    with gr.Accordion("📚 Citation", open=False, elem_id="citation-header"):
        citation_text = open("docs/citation.bib").read()
        gr.Textbox(
            value=citation_text,
            label="BibTeX for the leaderboard and the Zeus framework used for benchmarking:",
            lines=len(list(filter(lambda c: c == "\n", citation_text))),
            interactive=False,
            show_copy_button=True,
        )

    # Load the table on page load. `dataframe` here is the legacy tab's
    # table component (the name leaks out of the legacy tab block above).
    block.load(
        on_load,
        outputs=[dataframe, *dataframes],
        queue=False,
    )
if __name__ == "__main__":
    # Use a dedicated name for the argument parser so we do not shadow
    # `parser` (the `dateutil.parser` module imported at the top of this
    # file and used above to parse commit timestamps).
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--share", action="store_true", help="Specify if sharing is enabled"
    )
    arg_parser.add_argument("--concurrency", type=int, default=50)
    args = arg_parser.parse_args()

    # Launch the Gradio app with a bounded request queue.
    block.queue(concurrency_count=args.concurrency, api_open=False).launch(
        share=args.share, show_error=True
    )