Bram Vanroy committed on
Commit
2c801d0
1 Parent(s): 851256b

add training type

Browse files
Files changed (1) hide show
  1. app.py +11 -4
app.py CHANGED
@@ -62,18 +62,22 @@ def build_performance_df(performance_dict: dict[tuple[str, str], dict[str, float
62
  :return: a pd.DataFrame that has as rows the model names and as columns the benchmarks
63
  """
64
  data = []
 
 
65
  for (pretrained, lang), perfs in performance_dict.items():
66
  arc_perf = perfs.get(ARC, 0.0)
67
  hellaswag_perf = perfs.get(HELLASWAG, 0.0)
68
  mmlu_perf = perfs.get(MMLU, 0.0)
69
  truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
 
70
 
71
  avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
72
- row = [pretrained, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf]
73
  data.append(row)
74
 
75
  df = pd.DataFrame.from_records(data, columns=COLS)
76
  df = df.sort_values(by=[AVERAGE_COL], ascending=False)
 
77
  return df
78
 
79
 
@@ -83,12 +87,12 @@ def style_df(df: DataFrame) -> Styler:
83
  :param df: the dataframe to style
84
  :return: the Styler
85
  """
86
- styler = df.style.format("{:.2f}", subset=df.columns[1:])
87
 
88
  def highlight_max(col):
89
  return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)
90
 
91
- styler = styler.apply(highlight_max, axis=1, subset=df.columns[1:])
92
  styler = styler.hide()
93
  return styler
94
 
@@ -99,8 +103,9 @@ ARC_COL = "ARC (25-shot)"
99
  HELLASWAG_COL = "HellaSwag (10-shot)️"
100
  MMLU_COL = "MMLU (5-shot)"
101
  TRUTHFULQA_COL = "TruthfulQA (0-shot)"
 
102
 
103
- COLS = [MODEL_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
104
  TYPES = ["str", "number", "number", "number", "number", "number"]
105
 
106
  results = collect_results()
@@ -117,6 +122,8 @@ with gr.Blocks() as demo:
117
  datatype=TYPES,
118
  elem_id="leaderboard-table",
119
  )
 
 
120
 
121
  gr.Markdown("## LaTeX")
122
  gr.Code(styled_df.to_latex(convert_css=True))
 
62
  :return: a pd.DataFrame that has as rows the model names and as columns the benchmarks
63
  """
64
  data = []
65
+ dutch_training_info = json.loads(Path(__file__).parent.joinpath("evals/dutch_models.json").read_text(encoding="utf-8"))
66
+
67
  for (pretrained, lang), perfs in performance_dict.items():
68
  arc_perf = perfs.get(ARC, 0.0)
69
  hellaswag_perf = perfs.get(HELLASWAG, 0.0)
70
  mmlu_perf = perfs.get(MMLU, 0.0)
71
  truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
72
+ training_type = dutch_training_info.get(pretrained, "NA")
73
 
74
  avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
75
+ row = [pretrained, training_type, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf]
76
  data.append(row)
77
 
78
  df = pd.DataFrame.from_records(data, columns=COLS)
79
  df = df.sort_values(by=[AVERAGE_COL], ascending=False)
80
+
81
  return df
82
 
83
 
 
87
  :param df: the dataframe to style
88
  :return: the Styler
89
  """
90
+ styler = df.style.format("{:.2f}", subset=df.columns[2:])
91
 
92
  def highlight_max(col):
93
  return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)
94
 
95
+ styler = styler.apply(highlight_max, axis=1, subset=df.columns[2:])
96
  styler = styler.hide()
97
  return styler
98
 
 
103
  HELLASWAG_COL = "HellaSwag (10-shot)️"
104
  MMLU_COL = "MMLU (5-shot)"
105
  TRUTHFULQA_COL = "TruthfulQA (0-shot)"
106
+ TRAIN_TYPE_COL = "Training type"
107
 
108
+ COLS = [MODEL_COL, TRAIN_TYPE_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
109
  TYPES = ["str", "number", "number", "number", "number", "number"]
110
 
111
  results = collect_results()
 
122
  datatype=TYPES,
123
  elem_id="leaderboard-table",
124
  )
125
+ gr.Markdown("Training type: <code>PT</code>: pretrained on only/mostly Dutch; <code>FT</code>: **only** finetuned on"
126
+ " Dutch; <code>NA</code> not specifically pretrained nor finetuned on Dutch but Dutch data may have been a (small) portion of the training data")
127
 
128
  gr.Markdown("## LaTeX")
129
  gr.Code(styled_df.to_latex(convert_css=True))