Commit e9693d3
Author: Jae-Won Chung
Parent(s): b5a071f

No composite metrics by default, add examples

Changed files:
- LEADERBOARD.md +0 -8
- app.py +27 -26
LEADERBOARD.md
CHANGED

@@ -1,18 +1,10 @@
 The goal of the ML.ENERGY Leaderboard is to give people a sense of how much **energy** LLMs would consume.
 
-## How is energy different?
-
-The energy consumption of running inference depends on factors such as model architecture, size, and GPU model.
-However, even if we run models with the exact same architecture and size on the same GPU, the average energy consumption **per prompt** is different because different models have **different verbosity**.
-That is, when asked the same thing, different models answer in different lengths.
-
 ## Columns
 
 - `gpu`: NVIDIA GPU model name. Note that NLP evaluation was only run once on our A40 GPUs, so this column only changes system-level measurements like latency and energy.
 - `task`: Name of the task. See *Tasks* below for details.
-- `energy_eff`: Our definition of energy efficiency: Average NLP evaluation metric attained per Joule of energy (`nlp_average / energy`).
 - `energy` (J): The average energy consumed by the model to generate a response.
-- `nlp_average`: The arithmetic average of the NLP evaluation metrics we obtained. See *NLP evaluation metrics* below for details.
 - `throughput` (token/s): The average number of tokens generated per second.
 - `latency` (s): The average time it took for the model to generate a response.
 - `response_length` (token): The average number of tokens in the model's response.
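The composite columns dropped here (`nlp_average` and `energy_eff`) were computed in app.py from the per-task NLP scores and the measured energy, as the removed lines in the diff below show. A minimal sketch of that now-removed computation; the per-task score columns (`arc`, `hellaswag`) and all numbers are made up for illustration:

```python
import pandas as pd

# Toy stand-ins for score.csv and the benchmark results
# (columns "arc" and "hellaswag" are hypothetical; values are made up).
df_score = pd.DataFrame({"model": ["A", "B"], "arc": [0.50, 0.60], "hellaswag": [0.70, 0.80]})
res_df = pd.DataFrame({"model": ["A", "B"], "energy": [120.0, 300.0]})

# Removed composite metric 1: arithmetic mean of the NLP metrics.
columns = df_score.columns.to_list()
columns.remove("model")
df_score["nlp_average"] = df_score[columns].mean(axis=1)

# Removed composite metric 2: average NLP performance attained per Joule.
df = pd.merge(res_df, df_score, on=["model"]).round(2)
df["energy_eff"] = (df["nlp_average"] / df["energy"]).round(4)
print(df)
```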
app.py
CHANGED
@@ -35,7 +35,7 @@ class TableManager:
         df["model"] = df["model"].apply(format_model_link)
 
         # Sort by our 'energy efficiency' score.
-        df = df.sort_values(by="
+        df = df.sort_values(by="energy", ascending=True)
 
         # The full table where all the data are.
         self.full_df = df
@@ -47,11 +47,6 @@ class TableManager:
         """Read tables."""
         df_score = pd.read_csv(f"{data_dir}/score.csv")
 
-        # Compute average NLP metrics
-        columns = df_score.columns.to_list()
-        columns.remove("model")
-        df_score["nlp_average"] = df_score[columns].mean(axis=1)
-
         with open(f"{data_dir}/schema.yaml") as file:
             self.schema: dict[str, list] = yaml.safe_load(file)
 
@@ -71,16 +66,12 @@ class TableManager:
             raise ValueError(f"No benchmark CSV files were read from {data_dir=}.")
 
         df = pd.merge(res_df, df_score, on=["model"]).round(2)
-
-        # Energy efficiency is defined as the amount of average NLP performance
-        # the model gets per Joule of energy.
-        df["energy_eff"] = (df["nlp_average"] / df["energy"]).round(4)
 
         # Order columns.
         columns = df.columns.to_list()
         cols_to_order = ["model"]
         cols_to_order.extend(self.schema.keys())
-        cols_to_order.
+        cols_to_order.append("energy")
         columns = cols_to_order + [col for col in columns if col not in cols_to_order]
         df = df[columns]
 
@@ -118,10 +109,19 @@ class TableManager:
 
         # Evaluate the formula and catch any error.
         try:
-
-
+            # Give the users some helper functions that can be used in the formula
+            # like "@sum(response_length)".
+            col = self.full_df.eval(
+                formula,
+                local_dict={"sum": sum, "len": len, "max": max, "min": min},
+            )
+            # Only round floating point columns.
+            if isinstance(col, pd.Series) and col.dtype.kind == "f":
                 col = col.round(2)
-            self.full_df
+            if column_name in self.full_df.columns:
+                self.full_df[column_name] = col
+            else:
+                self.full_df.insert(len(self.schema) + 1, column_name, col)
         except Exception as exc:
             return self.cur_df, self._format_msg(f"Invalid formula: {exc}")
 
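The new `add_column` path above hands the user's formula to pandas `DataFrame.eval`, exposing `sum`, `len`, `max`, and `min` so formulas can reference them with an `@` prefix (e.g. `@sum(response_length)`). A standalone sketch of that pattern on a toy frame; the column names mirror the leaderboard table, but the values are made up:

```python
import pandas as pd

# Toy table with the leaderboard's column names (values are made up).
df = pd.DataFrame({
    "model": ["A", "B"],
    "energy": [120.0, 300.0],
    "latency": [1.5, 4.0],
    "response_length": [50, 220],
})

# Same pattern as the new add_column code: evaluate the user-supplied formula
# against the table, with a few helper functions reachable via "@".
formula = "response_length / energy"  # the commit's own "token_per_joule" example
col = df.eval(formula, local_dict={"sum": sum, "len": len, "max": max, "min": min})

# Only round floating-point results, as the new code does.
if isinstance(col, pd.Series) and col.dtype.kind == "f":
    col = col.round(2)

df["token_per_joule"] = col
print(df)
```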
@@ -132,8 +132,8 @@ class TableManager:
     def get_dropdown(self):
         columns = self.full_df.columns.tolist()[1:]
         return [
-            gr.Dropdown(
-            gr.Dropdown(
+            gr.Dropdown(choices=columns, label="X"),
+            gr.Dropdown(choices=columns, label="Y"),
             gr.Dropdown(choices=["None", *columns], label="Z (optional)"),
         ]
 
@@ -303,8 +303,8 @@ with block:
     with gr.Row():
         with gr.Column(scale=3):
             with gr.Row():
-                colname_input = gr.Textbox(
-                formula_input = gr.Textbox(
+                colname_input = gr.Textbox(lines=1, label="Custom column name")
+                formula_input = gr.Textbox(lines=1, label="Formula (@sum, @len, @max, and @min are supported)")
         with gr.Column(scale=1):
             with gr.Row():
                 add_col_btn = gr.Button("Add to table (⏎)", elem_classes=["btn-submit"])
@@ -312,6 +312,14 @@ with block:
                 clear_input_btn = gr.Button("Clear")
     with gr.Row():
         add_col_message = gr.HTML("")
+    gr.Examples(
+        examples=[
+            ["power", "energy / latency"],
+            ["token_per_joule", "response_length / energy"],
+            ["verbose", "response_length > @sum(response_length) / @len(response_length)"],
+        ],
+        inputs=[colname_input, formula_input],
+    )
     colname_input.submit(
         TableManager.add_column,
         inputs=[tbm, colname_input, formula_input],
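The `gr.Examples` block added above gives users one-click presets for the custom-column feature: selecting a row copies the column name and formula into the two textboxes. A minimal, self-contained sketch of that wiring; the `echo` handler is hypothetical and stands in for the app's `TableManager.add_column`:

```python
import gradio as gr

def echo(name: str, formula: str) -> str:
    # Hypothetical handler; the real app adds a column to the table instead.
    return f"Would add column {name!r} computed as {formula!r}"

with gr.Blocks() as demo:
    colname_input = gr.Textbox(lines=1, label="Custom column name")
    formula_input = gr.Textbox(lines=1, label="Formula")
    add_col_message = gr.HTML("")
    # Clicking an example row fills both textboxes with the preset values.
    gr.Examples(
        examples=[
            ["power", "energy / latency"],
            ["token_per_joule", "response_length / energy"],
        ],
        inputs=[colname_input, formula_input],
    )
    colname_input.submit(echo, inputs=[colname_input, formula_input], outputs=add_col_message)

demo.launch()
```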
@@ -349,14 +357,7 @@ with block:
         plot_width_input = gr.Textbox("600", lines=1, label="Width (px)")
         plot_height_input = gr.Textbox("600", lines=1, label="Height (px)")
     with gr.Row():
-
-        plot = gr.Plot(global_tbm.plot_scatter(
-            width=plot_width_input.value,
-            height=plot_height_input.value,
-            x=axis_dropdowns[0].value,
-            y=axis_dropdowns[1].value,
-            z=axis_dropdowns[2].value,
-        )[0])
+        plot = gr.Plot()
     with gr.Row():
         plot_message = gr.HTML("")
     add_col_btn.click(TableManager.update_dropdown, inputs=tbm, outputs=axis_dropdowns) # type: ignore
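The last hunk stops building the scatter plot eagerly at layout time (from the dropdowns' initial `.value`s) and instead places an empty `gr.Plot()` that gets filled in later. One way to populate such a placeholder is a `Blocks` load event, sketched below; this mechanism is an assumption for illustration, and the Space may instead redraw the plot from its existing dropdown and button callbacks. The `draw_scatter` helper is hypothetical and stands in for `TableManager.plot_scatter`:

```python
import gradio as gr
import plotly.express as px

def draw_scatter():
    # Hypothetical stand-in for the leaderboard's plot_scatter: the real app
    # builds the figure from the table and the selected X/Y/Z columns.
    return px.scatter(x=[120.0, 300.0], y=[1.5, 4.0],
                      labels={"x": "energy", "y": "latency"})

with gr.Blocks() as demo:
    plot = gr.Plot()  # empty placeholder, as in the commit
    # Render the figure once the page has loaded instead of at layout time.
    demo.load(draw_scatter, inputs=None, outputs=plot)

demo.launch()
```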