Bram Vanroy committed
Commit 8e901a2 · Parent(s): 575d1cf
update display

Files changed:
- README.md +1 -2
- app.py +200 -116
- content.py +5 -3
- evals/models.json +46 -16
- requirements.txt +1 -0
README.md CHANGED
@@ -4,8 +4,7 @@ emoji: 🐨
 colorFrom: purple
 colorTo: blue
 sdk: gradio
-sdk_version:
+sdk_version: 4.8.0
 app_file: app.py
 pinned: false
 ---
-
app.py CHANGED
@@ -1,8 +1,9 @@
 import json
 from collections import defaultdict
-from dataclasses import dataclass, field
+from dataclasses import dataclass, field, fields
 from functools import cached_property
 from pathlib import Path
+from typing import Literal
 
 import numpy as np
 import pandas as pd
@@ -12,41 +13,159 @@ from pandas.io.formats.style import Styler
 
 from content import *
 
-ARC = "arc"
-HELLASWAG = "hellaswag"
-MMLU = "mmlu"
-TRUTHFULQA = "truthfulqa"
-BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA]
-
-…
-TRAIN_TYPE_COL = "Training type"
-TRAIN_TYPE_COL = "Training type"
-NUM_PARAMETERS = "Num. parameters"
+TASK_METRICS = {
+    "arc": "acc_norm",
+    "hellaswag": "acc_norm",
+    "mmlu": "acc_norm",
+    "truthfulqa": "mc2",
+}
+
+MODEL_TYPE_EMOJIS = {
+    "pretrained": "🟢",
+    "fine-tuned": "🔶",
+    "instruction-tuned": "⭕",
+    "RL-tuned": "🟦",
+}
 
 
 @dataclass
 class Result:
-…
+    model_name: str
+    short_name: str
+    model_type: Literal["pretrained", "fine-tuned", "instruction-tuned", "RL-tuned"]
+    dutch_coverage: Literal["none", "pretrained", "fine-tuned"]
     num_parameters: int
-    arc: float = field(default=0.)
-…
+    arc: float = field(default=0.0)
+    average: float = field(default=0.0, init=False)
+    hellaswag: float = field(default=0.0)
+    mmlu: float = field(default=0.0)
+    truthfulqa: float = field(default=0.0)
+    num_parameters_kmb: str = field(init=False)
+
+    def __post_init__(self):
+        if self.model_type not in ["pretrained", "fine-tuned", "instruction-tuned", "RL-tuned"]:
+            raise ValueError(
+                f"Model type {self.model_type} must be one of 'pretrained', 'fine-tuned', 'instruction-tuned', 'RL-tuned'"
+            )
+        if self.dutch_coverage not in ["none", "pretrained", "fine-tuned"]:
+            raise ValueError(f"Dutch coverage {self.dutch_coverage} must be one of 'none', 'pretrained', 'fine-tuned'")
+
+        field_names = {f.name for f in fields(self)}
+        for task_name in TASK_METRICS:
+            if task_name not in field_names:
+                raise ValueError(f"Task name {task_name} not found in Result class fields so cannot create DataFrame")
+
+        self.average = (self.arc + self.hellaswag + self.mmlu + self.truthfulqa) / 4
+        self.num_parameters_kmb = convert_number_to_kmb(self.num_parameters)
+
+
+@dataclass
+class ResultSet:
+    results: list[Result]
+    column_names: dict[str, str] = field(default_factory=dict)
+    column_types: dict[str, str] = field(default_factory=dict)
+
+    def __post_init__(self):
+        if not self.column_names:
+            # Order will be the order of the columns in the DataFrame
+            self.column_names = {
+                "short_name": "Model",
+                "model_type": "T",
+                "dutch_coverage": "🇳🇱",
+                "num_parameters": "Size",
+                "average": "Avg.",
+                "arc": "ARC (25-shot)",
+                "hellaswag": "HellaSwag (10-shot)️",
+                "mmlu": "MMLU (5-shot)",
+                "truthfulqa": "TruthfulQA (0-shot)",
+            }
+            self.column_types = {
+                "Model": "markdown",
+                "T": "str",
+                "🇳🇱": "str",
+                "Size": "str",
+                "Avg.": "number",
+                "ARC (25-shot)": "number",
+                "HellaSwag (10-shot)️": "number",
+                "MMLU (5-shot)": "number",
+                "TruthfulQA (0-shot)": "number",
+            }
+
+        for column_type in self.column_types:
+            if column_type not in set(self.column_names.values()):
+                raise ValueError(
+                    f"Column names specified in column_types must be values in column_names."
+                    f" {column_type} not found."
+                )
+
+        if "average" not in self.column_names:
+            raise ValueError("Column names must contain 'average' column name")
+
+        field_names = [f.name for f in fields(Result)]
+        for column_name in self.column_names:
+            if column_name not in field_names:
+                raise ValueError(f"Column name {column_name} not found in Result class so cannot create DataFrame")
+
+    @cached_property
+    def df(self) -> DataFrame:
+        data = [
+            {
+                col_name: getattr(result, attr)
+                for attr, col_name in self.column_names.items()
+            }
+            for result in self.results
+        ]
+
+        df = pd.DataFrame(data)
+        df = df.sort_values(by=self.column_names["average"], ascending=False)
+        return df
 
     @cached_property
-    def …
+    def styled_df(self) -> Styler:
+        data = [
+            {
+                col_name: (f"<a target='_blank' href='https://huggingface.co/{result.model_name}'"
+                           f" style='color: var(--link-text-color); text-decoration: underline;text-decoration-style:"
+                           f" dotted;'>{result.short_name}</a>")
+                if attr == "short_name"
+                else MODEL_TYPE_EMOJIS[result.model_type]
+                if attr == "model_type"
+                else getattr(result, attr)
+                for attr, col_name in self.column_names.items()
+            }
+            for result in self.results
+        ]
+
+        df = pd.DataFrame(data)
+        df = df.sort_values(by=self.column_names["average"], ascending=False)
+        number_cols = [col for attr, col in self.column_names.items() if attr in TASK_METRICS or attr == "average"]
+        styler = df.style.format("{:.2f}", subset=number_cols)
+
+        def highlight_max(col):
+            return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)
+
+        styler = styler.apply(highlight_max, axis=0, subset=number_cols)
+
+        num_params_col = self.column_names["num_parameters"]
+        styler = styler.format(convert_number_to_kmb, subset=num_params_col)
+
+        styler = styler.hide()
+        return styler
 
     @cached_property
-    def …
+    def latex_df(self) -> Styler:
+        number_cols = [col for attr, col in self.column_names.items() if attr in TASK_METRICS or attr == "average"]
+        styler = self.df.style.format("{:.2f}", subset=number_cols)
+
+        def highlight_max(col):
+            return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)
+
+        styler = styler.apply(highlight_max, axis=1, subset=number_cols)
+        num_params_col = self.column_names["num_parameters"]
+        styler = styler.format(convert_number_to_kmb, subset=num_params_col)
+        styler = styler.hide()
+        return styler
 
 
 def convert_number_to_kmb(number: int) -> str:
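For reference, a minimal usage sketch of the reworked `Result` dataclass above (not part of the commit; the four task scores below are made-up placeholders):

from app import Result  # assumes app.py is importable; importing it also builds the Gradio demo

res = Result(
    model_name="mistralai/Mistral-7B-v0.1",
    short_name="mistral-7b-v0.1",
    model_type="pretrained",   # validated in __post_init__
    dutch_coverage="none",     # validated in __post_init__
    num_parameters=7241732096,
    arc=0.43, hellaswag=0.58, mmlu=0.37, truthfulqa=0.45,  # placeholder scores
)
print(res.average)             # 0.4575: mean of the four task scores, set in __post_init__
print(res.num_parameters_kmb)  # a human-readable size such as "7.2B", via convert_number_to_kmb

Note that `average` and `num_parameters_kmb` are declared with `init=False`, so they are never passed to the constructor; `__post_init__` fills them in.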
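The diff shows only the signature of `convert_number_to_kmb` and its final `return str(number)` fallback; the rest of the body is lost in this view. A plausible sketch of such a helper, as an assumption rather than the committed code:

def convert_number_to_kmb(number: int) -> str:
    """Render a count like 125198592 as "125.2M" and 7241732096 as "7.2B"."""
    for threshold, suffix in ((1_000_000_000, "B"), (1_000_000, "M"), (1_000, "K")):
        if number >= threshold:
            return f"{number / threshold:.1f}{suffix}"
    return str(number)  # the fallback that is visible in the diff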
@@ -65,121 +184,86 @@ def convert_number_to_kmb(number: int) -> str:
     return str(number)
 
 
-…
-def collect_results() -> dict[tuple[str, str], dict[str, float]]:
+def collect_results() -> ResultSet:
     """
     Collects results from the evals folder and returns a dictionary of results
    :return: a dictionary of results where the keys are typles of (model_name, language) and the values are
     dictionaries of the form {benchmark_name: performance_score}
     """
-…
+    evals_dir = Path(__file__).parent.joinpath("evals")
+    pf_overview = evals_dir.joinpath("models.json")
+    if not pf_overview.exists():
+        raise ValueError(
+            f"Overview file {pf_overview} not found. Make sure to generate it first with `generate_overview_json.py`."
+        )
+
+    model_info = json.loads(pf_overview.read_text(encoding="utf-8"))
+    model_results = {}
+    for pfin in evals_dir.rglob("*.json"):
         data = json.loads(pfin.read_text(encoding="utf-8"))
-        if "results" not in data or "config" not in data:
-            continue
-        results = data["results"]
-        config = data["config"]
-        if "model_args" not in config:
-            continue
 
-…
-        pretrained = [x for x in model_args if x.startswith("pretrained=")]
-        if len(pretrained) != 1:
+        if "results" not in data:
             continue
-…
-        performance_dict[(pretrained, lang)][task] = p
-
-    return dict(performance_dict)
-
-
-def build_performance_df(performance_dict: dict[tuple[str, str], dict[str, float]]) -> DataFrame:
-    """
-    Builds a dataframe from the performance dictionary
-    :param performance_dict: a dictionary of results where the keys are typles of (model_name, language) and the values are
-    dictionaries of the form {benchmark_name: performance_score}
-    :return: a pd.DataFrame that has as rows the model names and as columns the benchmarks
-    """
-    data = []
-    dutch_training_info = json.loads(Path(__file__).parent.joinpath("evals/dutch_models.json").read_text(encoding="utf-8"))
-
-    for (pretrained, lang), perfs in performance_dict.items():
-        arc_perf = perfs.get(ARC, 0.0)
-        hellaswag_perf = perfs.get(HELLASWAG, 0.0)
-        mmlu_perf = perfs.get(MMLU, 0.0)
-        truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
-        training_type = dutch_training_info.get(pretrained, "NA")
-
-        avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
-        row = [pretrained, training_type, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf]
-        data.append(row)
-
-    df = pd.DataFrame.from_records(data, columns=COLS)
-    df = df.sort_values(by=[AVERAGE_COL], ascending=False)
-
-    return df
-
-
-def style_df(df: DataFrame) -> Styler:
-    """
-    Styles the dataframe by rounding to two decimals and putting the max value in bold per column
-    :param df: the dataframe to style
-    :return: the Styler
-    """
-    styler = df.style.format("{:.2f}", subset=df.columns[2:])
-
-…
-
-    styler = styler.hide()
-    return styler
+        task_results = data["results"]
+        short_name = pfin.stem.split("_", 2)[2].lower()
+        if short_name not in model_results:
+            model_results[short_name] = {
+                "short_name": short_name,
+                "model_name": model_info[short_name]["model_name"],
+                "model_type": model_info[short_name]["model_type"],
+                "dutch_coverage": model_info[short_name]["dutch_coverage"],
+                "num_parameters": model_info[short_name]["num_parameters"],
+            }
+
+        for task_name, task_result in task_results.items():
+            task_name = task_name.rsplit("_", 1)[0]
+            metric = TASK_METRICS[task_name]
+            model_results[short_name][task_name] = task_result[metric]
+
+    model_results = ResultSet([Result(**res) for short_name, res in model_results.items()])
+
+    return model_results
 
 
-MODEL_COL = "Model"
-AVERAGE_COL = "Average"
-ARC_COL = "ARC (25-shot)"
-HELLASWAG_COL = "HellaSwag (10-shot)️"
-MMLU_COL = "MMLU (5-shot)"
-TRUTHFULQA_COL = "TruthfulQA (0-shot)"
-TRAIN_TYPE_COL = "Training type"
-TRAIN_TYPE_COL = "Training type"
-NUM_PARAMETERS = "Num. parameters"
-
-COLS = [MODEL_COL, TRAIN_TYPE_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
-TYPES = ["str", "number", "number", "number", "number", "number"]
-
-results = collect_results()
-original_df = build_performance_df(results)
-styled_df = style_df(original_df)
 with gr.Blocks() as demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRO_TEXT)
 
-    gr.Markdown("## Leaderboard\nOnly representative for the Dutch version (`*_nl`) of the benchmarks!")
+    gr.Markdown(f"## Leaderboard\nOnly representative for the Dutch version (`*_nl`) of the benchmarks!")
+
+    results = collect_results()
+
     gr.components.Dataframe(
-…
-        headers=…
-        datatype=…
+        results.styled_df,
+        headers=list(results.df.columns),
+        datatype=[results.column_types[col] for col in results.df.columns],  # To ensure same order as headers
+        interactive=False,
         elem_id="leaderboard-table",
     )
-…
+
+    with gr.Row():
+        with gr.Column():
+            modeltypes_str = "<br>".join([f"- {emoji}: {modeltype}" for modeltype, emoji in MODEL_TYPE_EMOJIS.items()])
+            gr.Markdown(f"Model types:<br>{modeltypes_str}")
+
+        with gr.Column():
+            gr.Markdown(
+                f"Language coverage ({results.column_names['dutch_coverage']}):"
+                f"<br>- `none`: no explicit/deliverate Dutch coverage,"
+                f"<br>- `pretrained`: pretrained on Dutch data,"
+                f"<br>- `fine-tuned`: fine-tuned on Dutch data"
+            )
+
+        with gr.Column():
+            metrics_str = "<br>".join([f"- {task}: `{metric}`" for task, metric in TASK_METRICS.items()])
+            gr.Markdown(f"Reported metrics:<br>{metrics_str}")
 
     gr.Markdown("## LaTeX")
-    gr.Code(…
+    gr.Code(results.latex_df.to_latex(convert_css=True))
 
     gr.Markdown(CREDIT, elem_classes="markdown-text")
     gr.Markdown(CITATION, elem_classes="markdown-text")
 
-if __name__ == '__main__':
-    demo.launch()
+
+if __name__ == "__main__":
+    demo.launch()
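`collect_results` above derives a model's short name from the eval file name via `pfin.stem.split("_", 2)[2]` and recovers the task key via `rsplit("_", 1)`. This implies eval files named like `<task>_<lang>_<model>.json` (an inference from the code, not stated in the commit); `models.json` itself is skipped by the `"results" not in data` guard. A small illustration with a hypothetical file name:

from pathlib import Path

pfin = Path("evals/arc/arc_nl_llama-2-7b-hf.json")  # hypothetical eval result file

short_name = pfin.stem.split("_", 2)[2].lower()
print(short_name)  # "llama-2-7b-hf", used as the key into evals/models.json

task_name = "arc_nl".rsplit("_", 1)[0]
print(task_name)   # "arc", used as the key into TASK_METRICS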
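The `highlight_max` helper used by `styled_df` and `latex_df` writes a CSS declaration that `Styler.to_latex(convert_css=True)` converts to `\bfseries`, which is what the `gr.Code` block above ends up displaying. Note that `styled_df` applies it with `axis=0` (bold maximum per column) while `latex_df` uses `axis=1` (per row). A self-contained illustration of the pattern on toy data:

import numpy as np
import pandas as pd

df = pd.DataFrame({"ARC (25-shot)": [0.31, 0.43], "MMLU (5-shot)": [0.29, 0.37]})  # toy scores

def highlight_max(col):
    # CSS for the per-column maximum; None leaves the other cells unstyled
    return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)

styler = df.style.format("{:.2f}").apply(highlight_max, axis=0).hide()
print(styler.to_latex(convert_css=True))  # bold cells come out as \bfseries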
content.py CHANGED
@@ -1,8 +1,10 @@
-TITLE = '<h1 align="center" id="space-title">Open …
+TITLE = '<h1 align="center" id="space-title">Open Dutch LLM Evaluation Leaderboard</h1>'
 
 INTRO_TEXT = f"""
 ## About
 
+This is a leaderboard for Dutch benchmarks for large language models.
+
 This is a fork of the [Open Multilingual LLM Evaluation Leaderboard](https://huggingface.co/spaces/uonlp/open_multilingual_llm_leaderboard), but restricted to only Dutch models and augmented with additional model results.
 We test the models on the following benchmarks **for the Dutch version only!!**, which have been translated into Dutch automatically by the original authors of the Open Multilingual LLM Evaluation Leaderboard with `gpt-35-turbo`.
 
@@ -13,7 +15,7 @@ We test the models on the following benchmarks **for the Dutch version only!!**,
 
 I do not maintain those datasets, I only run benchmarks and add the results to this space. For questions regarding the test sets or running them yourself, see [the original Github repository](https://github.com/laiviet/lm-evaluation-harness).
 
-…
+Disclaimer: I am aware that benchmarking models on *translated* data is not ideal. However, for Dutch there are no other options for generative models at the moment. If you have any suggestions for other Dutch benchmarks, please let me know so I can add them!
 """
 
 CREDIT = f"""
@@ -47,4 +49,4 @@ If you use the multilingual benchmarks, please cite the following paper:
 year={{2023}}
 }}
 ```
-"""
+"""
evals/models.json CHANGED
@@ -3,90 +3,120 @@
     "compute_dtype": "bfloat16",
     "model_name": "yhavinga/gpt-neo-1.3B-dutch",
     "num_parameters": 1315575808,
-    "quantization": "8-bit"
+    "quantization": "8-bit",
+    "model_type": "pretrained",
+    "dutch_coverage": "pretrained"
   },
   "gpt-neo-125m-dutch": {
     "compute_dtype": "bfloat16",
     "model_name": "yhavinga/gpt-neo-125M-dutch",
     "num_parameters": 125198592,
-    "quantization": "8-bit"
+    "quantization": "8-bit",
+    "model_type": "pretrained",
+    "dutch_coverage": "pretrained"
   },
   "gpt2-large-dutch": {
     "compute_dtype": "bfloat16",
     "model_name": "yhavinga/gpt2-large-dutch",
     "num_parameters": 774030080,
-    "quantization": "8-bit"
+    "quantization": "8-bit",
+    "model_type": "pretrained",
+    "dutch_coverage": "pretrained"
   },
   "gpt2-medium-dutch": {
     "compute_dtype": "bfloat16",
     "model_name": "yhavinga/gpt2-medium-dutch",
     "num_parameters": 354823168,
-    "quantization": "8-bit"
+    "quantization": "8-bit",
+    "model_type": "pretrained",
+    "dutch_coverage": "pretrained"
   },
   "llama-2-13b-chat-dutch": {
     "compute_dtype": "bfloat16",
    "model_name": "BramVanroy/Llama-2-13b-chat-dutch",
     "num_parameters": 13015864320,
-    "quantization": "8-bit"
+    "quantization": "8-bit",
+    "model_type": "instruction-tuned",
+    "dutch_coverage": "fine-tuned"
   },
   "llama-2-13b-chat-hf": {
     "compute_dtype": "bfloat16",
     "model_name": "meta-llama/Llama-2-13b-chat-hf",
     "num_parameters": 13015864320,
-    "quantization": "8-bit"
+    "quantization": "8-bit",
+    "model_type": "instruction-tuned",
+    "dutch_coverage": "none"
   },
   "llama-2-13b-hf": {
     "compute_dtype": "bfloat16",
     "model_name": "meta-llama/Llama-2-13b-hf",
     "num_parameters": 13015864320,
-    "quantization": "8-bit"
+    "quantization": "8-bit",
+    "model_type": "pretrained",
+    "dutch_coverage": "none"
   },
   "llama-2-7b-chat-hf": {
     "compute_dtype": "bfloat16",
     "model_name": "meta-llama/Llama-2-7b-chat-hf",
     "num_parameters": 6738415616,
-    "quantization": "8-bit"
+    "quantization": "8-bit",
+    "model_type": "instruction-tuned",
+    "dutch_coverage": "none"
   },
   "llama-2-7b-hf": {
     "compute_dtype": "bfloat16",
     "model_name": "meta-llama/Llama-2-7b-hf",
     "num_parameters": 6738415616,
-    "quantization": "8-bit"
+    "quantization": "8-bit",
+    "model_type": "pretrained",
+    "dutch_coverage": "none"
   },
-  "llama2-13b-ft-…
+  "llama2-13b-ft-mc4_nl_cleaned_tiny": {
     "compute_dtype": "bfloat16",
     "model_name": "BramVanroy/llama2-13b-ft-mc4_nl_cleaned_tiny",
     "num_parameters": 13015864320,
-    "quantization": "8-bit"
+    "quantization": "8-bit",
+    "model_type": "fine-tuned",
+    "dutch_coverage": "fine-tuned"
   },
   "mistral-7b-v0.1": {
     "compute_dtype": "bfloat16",
     "model_name": "mistralai/Mistral-7B-v0.1",
     "num_parameters": 7241732096,
-    "quantization": "8-bit"
+    "quantization": "8-bit",
+    "model_type": "pretrained",
+    "dutch_coverage": "none"
  },
   "neural-chat-7b-v3-1": {
     "compute_dtype": "bfloat16",
     "model_name": "Intel/neural-chat-7b-v3-1",
     "num_parameters": 7241732096,
-    "quantization": "8-bit"
+    "quantization": "8-bit",
+    "model_type": "RL-tuned",
+    "dutch_coverage": "none"
   },
   "orca-2-13b": {
     "compute_dtype": "bfloat16",
     "model_name": "microsoft/Orca-2-13b",
     "num_parameters": 13015895040,
-    "quantization": "8-bit"
+    "quantization": "8-bit",
+    "model_type": "fine-tuned",
+    "dutch_coverage": "none"
   },
   "orca-2-7b": {
     "compute_dtype": "bfloat16",
     "model_name": "microsoft/Orca-2-7b",
     "num_parameters": 6738440192,
-    "quantization": "8-bit"
+    "quantization": "8-bit",
+    "model_type": "fine-tuned",
+    "dutch_coverage": "none"
   },
   "zephyr-7b-beta": {
     "compute_dtype": "bfloat16",
     "model_name": "HuggingFaceH4/zephyr-7b-beta",
     "num_parameters": 7241732096,
-    "quantization": "8-bit"
+    "quantization": "8-bit",
+    "model_type": "RL-tuned",
+    "dutch_coverage": "none"
   }
 }
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
gradio==4.8.0
|