File size: 4,830 Bytes
d1253a8 863e074 5693ee5 8c2ee0f d1253a8 5693ee5 863e074 f067bfb d1253a8 a5244e0 d1253a8 a5244e0 d1253a8 5693ee5 d1253a8 863e074 5693ee5 d1253a8 863e074 d1253a8 863e074 d1253a8 863e074 d1253a8 863e074 a5244e0 d1253a8 863e074 5693ee5 d1253a8 5693ee5 2c801d0 d1253a8 2c801d0 d1253a8 2c801d0 5693ee5 8c2ee0f 5693ee5 863e074 2c801d0 d1253a8 5693ee5 2c801d0 5693ee5 2c801d0 559e4ba 5693ee5 8c2ee0f d1253a8 f067bfb d1253a8 2c801d0 d1253a8 2c801d0 5693ee5 d1253a8 5693ee5 f067bfb 5693ee5 2c801d0 5693ee5 8c2ee0f 06aa2d9 f067bfb 5693ee5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
import json
from collections import defaultdict
from pathlib import Path
import numpy as np
import pandas as pd
import gradio as gr
from pandas import DataFrame
from pandas.io.formats.style import Styler
from content import *
ARC = "arc"
HELLASWAG = "hellaswag"
MMLU = "mmlu"
TRUTHFULQA = "truthfulqa"
# Single source of truth: each benchmark mapped to the metric we report for it.
# BENCHMARKS and METRICS are kept as parallel lists for index-based lookup below.
_METRIC_BY_BENCHMARK = {
    ARC: "acc_norm",
    HELLASWAG: "acc_norm",
    MMLU: "acc_norm",
    TRUTHFULQA: "mc2",
}
BENCHMARKS = list(_METRIC_BY_BENCHMARK)
METRICS = list(_METRIC_BY_BENCHMARK.values())
def collect_results() -> dict[tuple[str, str], dict[str, float]]:
    """
    Collects results from the evals folder and returns a dictionary of results
    :return: a dictionary of results where the keys are tuples of (model_name, language) and the values are
     dictionaries of the form {benchmark_name: performance_score}
    """
    performance_dict = defaultdict(dict)
    for pfin in Path("evals").rglob("*.json"):
        data = json.loads(pfin.read_text(encoding="utf-8"))
        # Skip files that are not lm-eval result dumps.
        if "results" not in data or "config" not in data:
            continue
        results = data["results"]
        config = data["config"]
        if "model_args" not in config:
            continue
        # model_args is a comma-separated "key=value" string; find the pretrained model spec.
        model_args = config["model_args"].split(",")
        pretrained = [x for x in model_args if x.startswith("pretrained=")]
        if len(pretrained) != 1:
            continue
        # partition (not split) so model paths that themselves contain "=" stay intact.
        pretrained = pretrained[0].partition("=")[2]
        # Keep only the model name, dropping any org/namespace prefix.
        pretrained = pretrained.split("/")[-1]
        for lang_task, perfs in results.items():
            # Keys look like "<task>_<lang>"; rpartition tolerates underscores in the task
            # name and keys without any underscore (which would crash a plain split).
            task, _, lang = lang_task.rpartition("_")
            if task not in BENCHMARKS:
                # Was an `assert`, which is stripped under `python -O` and crashed the
                # whole collection on a single unknown task; skip such entries instead.
                continue
            if lang:
                metric = METRICS[BENCHMARKS.index(task)]
                # Report scores as percentages with one decimal.
                p = round(perfs[metric] * 100, 1)
                performance_dict[(pretrained, lang)][task] = p
    return dict(performance_dict)
def build_performance_df(performance_dict: dict[tuple[str, str], dict[str, float]]) -> DataFrame:
    """
    Turns the collected performance dictionary into a leaderboard table
    :param performance_dict: a dictionary of results where the keys are tuples of (model_name, language) and the
     values are dictionaries of the form {benchmark_name: performance_score}
    :return: a pd.DataFrame that has as rows the model names and as columns the benchmarks
    """
    # Maps model name -> training type (e.g. pretrained/finetuned on Dutch).
    dutch_training_info = json.loads(
        Path(__file__).parent.joinpath("evals/dutch_models.json").read_text(encoding="utf-8")
    )
    rows = []
    for (model_name, _lang), benchmark_scores in performance_dict.items():
        # Missing benchmarks count as 0.0 so every model gets a full row.
        scores = [benchmark_scores.get(bench, 0.0) for bench in (ARC, HELLASWAG, MMLU, TRUTHFULQA)]
        average = round(sum(scores) / 4, 1)
        training_type = dutch_training_info.get(model_name, "NA")
        rows.append([model_name, training_type, average, *scores])
    df = pd.DataFrame.from_records(rows, columns=COLS)
    # Best-performing model on top.
    return df.sort_values(by=[AVERAGE_COL], ascending=False)
def style_df(df: DataFrame) -> Styler:
"""
Styles the dataframe by rounding to two decimals and putting the max value in bold per column
:param df: the dataframe to style
:return: the Styler
"""
styler = df.style.format("{:.2f}", subset=df.columns[2:])
def highlight_max(col):
return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)
styler = styler.apply(highlight_max, axis=1, subset=df.columns[2:])
styler = styler.hide()
return styler
MODEL_COL = "Model"
AVERAGE_COL = "Average"
ARC_COL = "ARC (25-shot)"
HELLASWAG_COL = "HellaSwag (10-shot)️"
MMLU_COL = "MMLU (5-shot)"
TRUTHFULQA_COL = "TruthfulQA (0-shot)"
TRAIN_TYPE_COL = "Training type"
# Column order of the leaderboard table.
COLS = [MODEL_COL, TRAIN_TYPE_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
# Gradio datatype per column, aligned 1:1 with COLS. The previous list had only six
# entries for seven columns (the "str" for Training type was missing), which shifted
# every datatype after the Model column.
TYPES = ["str", "str", "number", "number", "number", "number", "number"]
# Build the leaderboard once at import time; the Gradio app serves the precomputed table.
results = collect_results()
original_df = build_performance_df(results)
styled_df = style_df(original_df)
with gr.Blocks() as demo:
    # TITLE, INTRO_TEXT, CREDIT and CITATION come from the star-import of `content`.
    gr.HTML(TITLE)
    gr.Markdown(INTRO_TEXT)
    gr.Markdown("## Leaderboard\nOnly representative for the Dutch version (`*_nl`) of the benchmarks!")
    # Interactive table with the raw (unstyled) scores.
    gr.components.Dataframe(
        value=original_df,
        headers=COLS,
        datatype=TYPES,
        elem_id="leaderboard-table",
    )
    gr.Markdown("Training type: <code>PT</code>: pretrained on only/mostly Dutch; <code>FT</code>: **only** finetuned on"
                " Dutch; <code>NA</code> not specifically pretrained nor finetuned on Dutch but Dutch data may have been a (small) portion of the training data")
    # Copy-pasteable LaTeX export of the styled (bold-max) table for use in papers.
    gr.Markdown("## LaTeX")
    gr.Code(styled_df.to_latex(convert_css=True))
    gr.Markdown(CREDIT, elem_classes="markdown-text")
    gr.Markdown(CITATION, elem_classes="markdown-text")
if __name__ == '__main__':
    demo.launch()
|