Commit e9693d3
Author: Jae-Won Chung
Parent(s): b5a071f

No composite metrics by default, add examples

Changed files:
- LEADERBOARD.md +0 -8
- app.py +27 -26
LEADERBOARD.md
CHANGED

@@ -1,18 +1,10 @@
 The goal of the ML.ENERGY Leaderboard is to give people a sense of how much **energy** LLMs would consume.
 
-## How is energy different?
-
-The energy consumption of running inference depends on factors such as model architecture, size, and GPU model.
-However, even if we run models with the exact same architecture and size on the same GPU, the average energy consumption **per prompt** is different because different models have **different verbosity**.
-That is, when asked the same thing, different models answer in different lengths.
-
 ## Columns
 
 - `gpu`: NVIDIA GPU model name. Note that NLP evaluation was only run once on our A40 GPUs, so this column only changes system-level measurements like latency and energy.
 - `task`: Name of the task. See *Tasks* below for details.
-- `energy_eff`: Our definition of energy efficiency: Average NLP evaluation metric attained per Joule of energy (`nlp_average / energy`).
 - `energy` (J): The average energy consumed by the model to generate a response.
-- `nlp_average`: The arithmetic average of the NLP evaluation metrics we obtained. See *NLP evaluation metrics* below for details.
 - `throughput` (token/s): The average number of tokens generated per second.
 - `latency` (s): The average time it took for the model to generate a response.
 - `response_length` (token): The average number of tokens in the model's response.
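The composite columns dropped here (`nlp_average` and `energy_eff`) were computed in app.py from the per-task NLP scores and the measured energy, as the removed lines in the diff below show. A minimal sketch of that now-removed computation; the per-task score columns (`arc`, `hellaswag`) and all numbers are made up for illustration:

```python
import pandas as pd

# Toy stand-ins for score.csv and the benchmark results
# (columns "arc" and "hellaswag" are hypothetical; values are made up).
df_score = pd.DataFrame({"model": ["A", "B"], "arc": [0.50, 0.60], "hellaswag": [0.70, 0.80]})
res_df = pd.DataFrame({"model": ["A", "B"], "energy": [120.0, 300.0]})

# Removed composite metric 1: arithmetic mean of the NLP metrics.
columns = df_score.columns.to_list()
columns.remove("model")
df_score["nlp_average"] = df_score[columns].mean(axis=1)

# Removed composite metric 2: average NLP performance attained per Joule.
df = pd.merge(res_df, df_score, on=["model"]).round(2)
df["energy_eff"] = (df["nlp_average"] / df["energy"]).round(4)
print(df)
```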
app.py
CHANGED
@@ -35,7 +35,7 @@ class TableManager:
         df["model"] = df["model"].apply(format_model_link)
 
         # Sort by our 'energy efficiency' score.
-        df = df.sort_values(by="
+        df = df.sort_values(by="energy", ascending=True)
 
         # The full table where all the data are.
         self.full_df = df
@@ -47,11 +47,6 @@ class TableManager:
         """Read tables."""
         df_score = pd.read_csv(f"{data_dir}/score.csv")
 
-        # Compute average NLP metrics
-        columns = df_score.columns.to_list()
-        columns.remove("model")
-        df_score["nlp_average"] = df_score[columns].mean(axis=1)
-
         with open(f"{data_dir}/schema.yaml") as file:
             self.schema: dict[str, list] = yaml.safe_load(file)
 
@@ -71,16 +66,12 @@ class TableManager:
             raise ValueError(f"No benchmark CSV files were read from {data_dir=}.")
 
         df = pd.merge(res_df, df_score, on=["model"]).round(2)
-
-        # Energy efficiency is defined as the amount of average NLP performance
-        # the model gets per Joule of energy.
-        df["energy_eff"] = (df["nlp_average"] / df["energy"]).round(4)
 
         # Order columns.
         columns = df.columns.to_list()
         cols_to_order = ["model"]
         cols_to_order.extend(self.schema.keys())
-        cols_to_order.
+        cols_to_order.append("energy")
         columns = cols_to_order + [col for col in columns if col not in cols_to_order]
         df = df[columns]
 
@@ -118,10 +109,19 @@ class TableManager:
 
         # Evaluate the formula and catch any error.
         try:
-
-
+            # Give the users some helper functions that can be used in the formula
+            # like "@sum(response_length)".
+            col = self.full_df.eval(
+                formula,
+                local_dict={"sum": sum, "len": len, "max": max, "min": min},
+            )
+            # Only round floating point columns.
+            if isinstance(col, pd.Series) and col.dtype.kind == "f":
                 col = col.round(2)
-            self.full_df
+            if column_name in self.full_df.columns:
+                self.full_df[column_name] = col
+            else:
+                self.full_df.insert(len(self.schema) + 1, column_name, col)
         except Exception as exc:
             return self.cur_df, self._format_msg(f"Invalid formula: {exc}")
 
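The new `add_column` path above hands the user's formula to pandas `DataFrame.eval`, exposing `sum`, `len`, `max`, and `min` so formulas can reference them with an `@` prefix (e.g. `@sum(response_length)`). A standalone sketch of that pattern on a toy frame; the column names mirror the leaderboard table, but the values are made up:

```python
import pandas as pd

# Toy table with the leaderboard's column names (values are made up).
df = pd.DataFrame({
    "model": ["A", "B"],
    "energy": [120.0, 300.0],
    "latency": [1.5, 4.0],
    "response_length": [50, 220],
})

# Same pattern as the new add_column code: evaluate the user-supplied formula
# against the table, with a few helper functions reachable via "@".
formula = "response_length / energy"  # the commit's own "token_per_joule" example
col = df.eval(formula, local_dict={"sum": sum, "len": len, "max": max, "min": min})

# Only round floating-point results, as the new code does.
if isinstance(col, pd.Series) and col.dtype.kind == "f":
    col = col.round(2)

df["token_per_joule"] = col
print(df)
```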
@@ -132,8 +132,8 @@ class TableManager:
     def get_dropdown(self):
         columns = self.full_df.columns.tolist()[1:]
         return [
-            gr.Dropdown(
-            gr.Dropdown(
+            gr.Dropdown(choices=columns, label="X"),
+            gr.Dropdown(choices=columns, label="Y"),
             gr.Dropdown(choices=["None", *columns], label="Z (optional)"),
         ]
 
@@ -303,8 +303,8 @@ with block:
     with gr.Row():
         with gr.Column(scale=3):
             with gr.Row():
-                colname_input = gr.Textbox(
-                formula_input = gr.Textbox(
+                colname_input = gr.Textbox(lines=1, label="Custom column name")
+                formula_input = gr.Textbox(lines=1, label="Formula (@sum, @len, @max, and @min are supported)")
         with gr.Column(scale=1):
             with gr.Row():
                 add_col_btn = gr.Button("Add to table (⏎)", elem_classes=["btn-submit"])
@@ -312,6 +312,14 @@ with block:
                 clear_input_btn = gr.Button("Clear")
     with gr.Row():
         add_col_message = gr.HTML("")
+    gr.Examples(
+        examples=[
+            ["power", "energy / latency"],
+            ["token_per_joule", "response_length / energy"],
+            ["verbose", "response_length > @sum(response_length) / @len(response_length)"],
+        ],
+        inputs=[colname_input, formula_input],
+    )
     colname_input.submit(
         TableManager.add_column,
         inputs=[tbm, colname_input, formula_input],
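The `gr.Examples` block added above gives users one-click presets for the custom-column feature: selecting a row copies the column name and formula into the two textboxes. A minimal, self-contained sketch of that wiring; the `echo` handler is hypothetical and stands in for the app's `TableManager.add_column`:

```python
import gradio as gr

def echo(name: str, formula: str) -> str:
    # Hypothetical handler; the real app adds a column to the table instead.
    return f"Would add column {name!r} computed as {formula!r}"

with gr.Blocks() as demo:
    colname_input = gr.Textbox(lines=1, label="Custom column name")
    formula_input = gr.Textbox(lines=1, label="Formula")
    add_col_message = gr.HTML("")
    # Clicking an example row fills both textboxes with the preset values.
    gr.Examples(
        examples=[
            ["power", "energy / latency"],
            ["token_per_joule", "response_length / energy"],
        ],
        inputs=[colname_input, formula_input],
    )
    colname_input.submit(echo, inputs=[colname_input, formula_input], outputs=add_col_message)

demo.launch()
```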
@@ -349,14 +357,7 @@ with block:
         plot_width_input = gr.Textbox("600", lines=1, label="Width (px)")
         plot_height_input = gr.Textbox("600", lines=1, label="Height (px)")
     with gr.Row():
-
-        plot = gr.Plot(global_tbm.plot_scatter(
-            width=plot_width_input.value,
-            height=plot_height_input.value,
-            x=axis_dropdowns[0].value,
-            y=axis_dropdowns[1].value,
-            z=axis_dropdowns[2].value,
-        )[0])
+        plot = gr.Plot()
     with gr.Row():
         plot_message = gr.HTML("")
     add_col_btn.click(TableManager.update_dropdown, inputs=tbm, outputs=axis_dropdowns) # type: ignore
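The last hunk stops building the scatter plot eagerly at layout time (from the dropdowns' initial `.value`s) and instead places an empty `gr.Plot()` that gets filled in later. One way to populate such a placeholder is a `Blocks` load event, sketched below; this mechanism is an assumption for illustration, and the Space may instead redraw the plot from its existing dropdown and button callbacks. The `draw_scatter` helper is hypothetical and stands in for `TableManager.plot_scatter`:

```python
import gradio as gr
import plotly.express as px

def draw_scatter():
    # Hypothetical stand-in for the leaderboard's plot_scatter: the real app
    # builds the figure from the table and the selected X/Y/Z columns.
    return px.scatter(x=[120.0, 300.0], y=[1.5, 4.0],
                      labels={"x": "energy", "y": "latency"})

with gr.Blocks() as demo:
    plot = gr.Plot()  # empty placeholder, as in the commit
    # Render the figure once the page has loaded instead of at layout time.
    demo.load(draw_scatter, inputs=None, outputs=plot)

demo.launch()
```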