import json
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
import gradio as gr
from pandas import DataFrame
from pandas.io.formats.style import Styler

from content import *


ARC = "arc"
HELLASWAG = "hellaswag"
MMLU = "mmlu"
TRUTHFULQA = "truthfulqa"
BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA]

METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]


def collect_results() -> dict[tuple[str, str], dict[str, float]]:
    """
    Collects results from the evals folder and returns a dictionary of results
    :return: a dictionary of results where the keys are tuples of (model_name, language) and the values are
        dictionaries of the form {benchmark_name: performance_score}
    """
    performance_dict = defaultdict(dict)
    for pfin in Path("evals").rglob("*.json"):
        data = json.loads(pfin.read_text(encoding="utf-8"))
        if "results" not in data or "config" not in data:
            continue
        results = data["results"]
        config = data["config"]
        if "model_args" not in config:
            continue

        model_args = config["model_args"].split(",")
        pretrained = [x for x in model_args if x.startswith("pretrained=")]
        if len(pretrained) != 1:
            continue
        pretrained = pretrained[0].split("=")[1]
        pretrained = pretrained.split("/")[-1]
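        # e.g. a model_args value of "pretrained=some-org/some-model-7b,dtype=float16" (illustrative,
        # not an actual entry) reduces to the bare model name "some-model-7b"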

        for lang_task, perfs in results.items():
            task, lang = lang_task.split("_")
            assert task in BENCHMARKS

            if lang and task:
                metric = METRICS[BENCHMARKS.index(task)]
                p = round(perfs[metric] * 100, 1)
                performance_dict[(pretrained, lang)][task] = p

    return dict(performance_dict)


def build_performance_df(performance_dict: dict[tuple[str, str], dict[str, float]]) -> DataFrame:
    """
    Builds a dataframe from the performance dictionary
    :param performance_dict: a dictionary of results where the keys are tuples of (model_name, language) and the
        values are dictionaries of the form {benchmark_name: performance_score}
    :return: a pd.DataFrame that has as rows the model names and as columns the benchmarks
    """
    data = []
    dutch_training_info = json.loads(
        Path(__file__).parent.joinpath("evals/dutch_models.json").read_text(encoding="utf-8")
    )
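    # dutch_models.json is assumed to map a bare model name to its training type ("PT" or "FT";
    # see the legend below the leaderboard). Models that are not listed fall back to "NA".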

    for (pretrained, lang), perfs in performance_dict.items():
        arc_perf = perfs.get(ARC, 0.0)
        hellaswag_perf = perfs.get(HELLASWAG, 0.0)
        mmlu_perf = perfs.get(MMLU, 0.0)
        truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
        training_type = dutch_training_info.get(pretrained, "NA")

        avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
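        # A benchmark that is missing for a model counts as 0.0 here, so the average penalizes
        # incomplete evals rather than averaging only over the available scores.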
        row = [pretrained, training_type, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf]
        data.append(row)

    df = pd.DataFrame.from_records(data, columns=COLS)
    df = df.sort_values(by=[AVERAGE_COL], ascending=False)

    return df


def style_df(df: DataFrame) -> Styler:
    """
    Styles the dataframe by formatting the scores to two decimals and bolding the max value per column
    :param df: the dataframe to style
    :return: the Styler
    """
    styler = df.style.format("{:.2f}", subset=df.columns[2:])

    def highlight_max(col):
        # Bold the maximum value of each numeric column (NaNs are ignored by nanmax)
        return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)

    # axis=0 applies highlight_max column by column, i.e. the best score per benchmark is bolded
    styler = styler.apply(highlight_max, axis=0, subset=df.columns[2:])
    styler = styler.hide()  # hide the DataFrame index in the rendered table
    return styler


MODEL_COL = "Model"
AVERAGE_COL = "Average"
ARC_COL = "ARC (25-shot)"
HELLASWAG_COL = "HellaSwag (10-shot)"
MMLU_COL = "MMLU (5-shot)"
TRUTHFULQA_COL = "TruthfulQA (0-shot)"
TRAIN_TYPE_COL = "Training type"

COLS = [MODEL_COL, TRAIN_TYPE_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
TYPES = ["str", "str", "number", "number", "number", "number", "number"]


results = collect_results()
original_df = build_performance_df(results)
styled_df = style_df(original_df)

with gr.Blocks() as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRO_TEXT)

    gr.Markdown("## Leaderboard\nOnly representative for the Dutch version (`*_nl`) of the benchmarks!")
    gr.components.Dataframe(
        value=original_df,
        headers=COLS,
        datatype=TYPES,
        elem_id="leaderboard-table",
    )
    gr.Markdown(
        "Training type: <code>PT</code>: pretrained on only/mostly Dutch; <code>FT</code>: **only** finetuned"
        " on Dutch; <code>NA</code>: not specifically pretrained or finetuned on Dutch, but Dutch data may have"
        " been a (small) portion of the training data"
    )

    gr.Markdown("## LaTeX")
    gr.Code(styled_df.to_latex(convert_css=True))
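    # convert_css=True asks the Styler to translate the "font-weight: bold;" CSS into LaTeX
    # bold commands so the highlighted maxima survive in the exported table.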

    gr.Markdown(CREDIT, elem_classes="markdown-text")
    gr.Markdown(CITATION, elem_classes="markdown-text")


if __name__ == "__main__":
    demo.launch()