open_dutch_llm_leaderboard

Running

App Files Files Community

Bram Vanroy committed on Nov 30, 2023

Commit

2c801d0

•

1 Parent(s): 851256b

add training type

Browse files

Files changed (1) hide show

app.py +11 -4

app.py CHANGED Viewed

@@ -62,18 +62,22 @@ def build_performance_df(performance_dict: dict[tuple[str, str], dict[str, float
     :return: a pd.DataFrame that has as rows the model names and as columns the benchmarks
     """
     data = []
     for (pretrained, lang), perfs in performance_dict.items():
         arc_perf = perfs.get(ARC, 0.0)
         hellaswag_perf = perfs.get(HELLASWAG, 0.0)
         mmlu_perf = perfs.get(MMLU, 0.0)
         truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
         avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
-        row = [pretrained, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf]
         data.append(row)
     df = pd.DataFrame.from_records(data, columns=COLS)
     df = df.sort_values(by=[AVERAGE_COL], ascending=False)
     return df
@@ -83,12 +87,12 @@ def style_df(df: DataFrame) -> Styler:
     :param df: the dataframe to style
     :return: the Styler
     """
-    styler = df.style.format("{:.2f}", subset=df.columns[1:])
     def highlight_max(col):
         return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)
-    styler = styler.apply(highlight_max, axis=1, subset=df.columns[1:])
     styler = styler.hide()
     return styler
@@ -99,8 +103,9 @@ ARC_COL = "ARC (25-shot)"
 HELLASWAG_COL = "HellaSwag (10-shot)️"
 MMLU_COL = "MMLU (5-shot)"
 TRUTHFULQA_COL = "TruthfulQA (0-shot)"
-COLS = [MODEL_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
 TYPES = ["str", "number", "number", "number", "number", "number"]
 results = collect_results()
@@ -117,6 +122,8 @@ with gr.Blocks() as demo:
         datatype=TYPES,
         elem_id="leaderboard-table",
     )
     gr.Markdown("## LaTeX")
     gr.Code(styled_df.to_latex(convert_css=True))

     :return: a pd.DataFrame that has as rows the model names and as columns the benchmarks
     """
     data = []
+    dutch_training_info = json.loads(Path(__file__).parent.joinpath("evals/dutch_models.json").read_text(encoding="utf-8"))
     for (pretrained, lang), perfs in performance_dict.items():
         arc_perf = perfs.get(ARC, 0.0)
         hellaswag_perf = perfs.get(HELLASWAG, 0.0)
         mmlu_perf = perfs.get(MMLU, 0.0)
         truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
+        training_type = dutch_training_info.get(pretrained, "NA")
         avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
+        row = [pretrained, training_type, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf]
         data.append(row)
     df = pd.DataFrame.from_records(data, columns=COLS)
     df = df.sort_values(by=[AVERAGE_COL], ascending=False)
     return df
     :param df: the dataframe to style
     :return: the Styler
     """
+    styler = df.style.format("{:.2f}", subset=df.columns[2:])
     def highlight_max(col):
         return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)
+    styler = styler.apply(highlight_max, axis=1, subset=df.columns[2:])
     styler = styler.hide()
     return styler
 HELLASWAG_COL = "HellaSwag (10-shot)️"
 MMLU_COL = "MMLU (5-shot)"
 TRUTHFULQA_COL = "TruthfulQA (0-shot)"
+TRAIN_TYPE_COL = "Training type"
+COLS = [MODEL_COL, TRAIN_TYPE_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
 TYPES = ["str", "number", "number", "number", "number", "number"]
 results = collect_results()
         datatype=TYPES,
         elem_id="leaderboard-table",
     )
+    gr.Markdown("Training type: <code>PT</code>: pretrained on only/mostly Dutch; <code>FT</code>: **only** finetuned on"
+            " Dutch; <code>NA</code> not specifically pretrained nor finetuned on Dutch but Dutch data may have been a (small) portion of the training data")
     gr.Markdown("## LaTeX")
     gr.Code(styled_df.to_latex(convert_css=True))