Bram Vanroy committed · Commit 0658988 · 1 Parent(s): 351f9fe

remove mixtral - was not tested in 8-bit

Browse files
- app.py +23 -6
- evals/hellaswag/hellaswag_nl_Mistral-7B-v0.1.json +0 -23
- evals/hellaswag/hellaswag_nl_Mixtral-8x7B-v0.1.json +0 -23
- evals/models.json +0 -8
- generate_overview_json.py +2 -1
app.py CHANGED

```diff
@@ -28,6 +28,8 @@ MODEL_TYPE_EMOJIS = {
     "RL-tuned": "🟦",
 }
 
+NOT_GIVEN_SYMBOL = "❔"
+
 
 @dataclass
 class Result:
@@ -44,12 +46,14 @@ class Result:
     num_parameters_kmb: str = field(init=False)
 
     def __post_init__(self):
-        if self.model_type not in ["pretrained", "fine-tuned", "instruction-tuned", "RL-tuned"]:
+        if self.model_type not in ["pretrained", "fine-tuned", "instruction-tuned", "RL-tuned", "not-given"]:
+            raise ValueError(
+                f"Model type {self.model_type} must be one of 'pretrained', 'fine-tuned', 'instruction-tuned', 'RL-tuned', 'not-given'"
+            )
+        if self.dutch_coverage not in ["none", "pretrained", "fine-tuned", "not-given"]:
             raise ValueError(
-                f"Model type {self.model_type} must be one of 'pretrained', 'fine-tuned', 'instruction-tuned', 'RL-tuned'"
+                f"Dutch coverage {self.dutch_coverage} must be one of 'none', 'pretrained', 'fine-tuned', 'not-given'"
             )
-        if self.dutch_coverage not in ["none", "pretrained", "fine-tuned"]:
-            raise ValueError(f"Dutch coverage {self.dutch_coverage} must be one of 'none', 'pretrained', 'fine-tuned'")
 
         field_names = {f.name for f in fields(self)}
         for task_name in TASK_METRICS:
@@ -128,8 +132,10 @@ class ResultSet:
                         f" dotted;'>{result.short_name}</a>"
                     )
                     if attr == "short_name"
-                    else MODEL_TYPE_EMOJIS[result.model_type]
+                    else MODEL_TYPE_EMOJIS.get(result.model_type, NOT_GIVEN_SYMBOL)
                     if attr == "model_type"
+                    else (result.dutch_coverage if result.dutch_coverage != "not-given" else NOT_GIVEN_SYMBOL)
+                    if attr == "dutch_coverage"
                     else getattr(result, attr)
                     for attr, col_name in self.column_names.items()
                 }
@@ -203,8 +209,16 @@ def collect_results() -> ResultSet:
 
         if "results" not in data:
             continue
+
         task_results = data["results"]
        short_name = pfin.stem.split("_", 2)[2].lower()
+
+        if short_name not in model_info:
+            raise KeyError(
+                f"Model {short_name} not found in overview file {pf_overview.name}. This means that a results JSON"
+                f" file exists that has not yet been processed. First run the `generate_overview_json.py` script."
+            )
+
         if short_name not in model_results:
             model_results[short_name] = {
                 "short_name": short_name,
@@ -228,7 +242,10 @@ with gr.Blocks() as demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRO_TEXT)
 
-    gr.Markdown(f"## Leaderboard\nOnly representative for the Dutch version (`*_nl`) of the benchmarks!")
+    gr.Markdown(
+        f"## Leaderboard\nOnly representative for the Dutch version (`*_nl`) of the benchmarks!"
+        " All models have been benchmarked in 8-bit."
+    )
 
     results = collect_results()
 
```
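In isolation, the fallback behaviour these hunks introduce looks like the sketch below. It is illustrative only, not repository code; apart from `"RL-tuned"`, the emoji values are placeholders, and the two cell helpers are hypothetical names for the corresponding branches of the list comprehension above.

```python
# Minimal sketch of the "not-given" fallback (placeholder emojis except "RL-tuned").
NOT_GIVEN_SYMBOL = "❔"
MODEL_TYPE_EMOJIS = {
    "pretrained": "🟢",
    "fine-tuned": "🔶",
    "instruction-tuned": "⭕",
    "RL-tuned": "🟦",
}

def model_type_cell(model_type: str) -> str:
    # dict.get() returns the placeholder instead of raising KeyError on unknown types.
    return MODEL_TYPE_EMOJIS.get(model_type, NOT_GIVEN_SYMBOL)

def dutch_coverage_cell(dutch_coverage: str) -> str:
    # The coverage column shows the placeholder for "not-given" entries.
    return dutch_coverage if dutch_coverage != "not-given" else NOT_GIVEN_SYMBOL

assert model_type_cell("RL-tuned") == "🟦"
assert model_type_cell("not-given") == NOT_GIVEN_SYMBOL
assert dutch_coverage_cell("pretrained") == "pretrained"
assert dutch_coverage_cell("not-given") == NOT_GIVEN_SYMBOL
```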
evals/hellaswag/hellaswag_nl_Mistral-7B-v0.1.json DELETED

```diff
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_nl": {
-      "acc": 0.44079870480302213,
-      "acc_stderr": 0.005158280633507224,
-      "acc_norm": 0.5840259039395574,
-      "acc_norm_stderr": 0.005120942804814836
-    }
-  },
-  "versions": {
-    "hellaswag_nl": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=mistralai/Mistral-7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
-    "batch_size": "auto",
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
```
evals/hellaswag/hellaswag_nl_Mixtral-8x7B-v0.1.json DELETED

```diff
@@ -1,23 +0,0 @@
-{
-  "results": {
-    "hellaswag_nl": {
-      "acc": 0.5143011332973556,
-      "acc_stderr": 0.0051926973681393875,
-      "acc_norm": 0.67835941716136,
-      "acc_norm_stderr": 0.004853064643337017
-    }
-  },
-  "versions": {
-    "hellaswag_nl": 1
-  },
-  "config": {
-    "model": "hf-auto",
-    "model_args": "pretrained=mistralai/Mixtral-8x7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=auto",
-    "batch_size": 1,
-    "device": "cuda",
-    "no_cache": false,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
```
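The `model_args` in the two deleted result files show the inconsistency the commit message refers to: the Mistral-7B run used `load_in_8bit=True`, while the Mixtral run used `dtype=auto` with no 8-bit loading. A quick illustrative check (the `parse_model_args` helper is hypothetical, written only for this comparison):

```python
# Illustrative comparison (not repository code): parse the comma-separated
# `model_args` strings from the two deleted result files and compare their
# quantization settings.
def parse_model_args(model_args: str) -> dict:
    return dict(pair.split("=", 1) for pair in model_args.split(","))

mistral = parse_model_args(
    "pretrained=mistralai/Mistral-7B-v0.1,use_accelerate=True,"
    "device_map_option=auto,dtype=bfloat16,load_in_8bit=True"
)
mixtral = parse_model_args(
    "pretrained=mistralai/Mixtral-8x7B-v0.1,use_accelerate=True,"
    "device_map_option=auto,dtype=auto"
)

assert mistral.get("load_in_8bit") == "True"  # Mistral-7B: benchmarked in 8-bit
assert "load_in_8bit" not in mixtral          # Mixtral: not run in 8-bit
```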
evals/models.json CHANGED

```diff
@@ -87,14 +87,6 @@
         "num_parameters": 7241732096,
         "quantization": "8-bit"
     },
-    "mixtral-8x7b-v0.1": {
-        "compute_dtype": "auto",
-        "dutch_coverage": "not-given",
-        "model_name": "mistralai/Mixtral-8x7B-v0.1",
-        "model_type": "not-given",
-        "num_parameters": 46702792704,
-        "quantization": null
-    },
     "neural-chat-7b-v3-1": {
         "compute_dtype": "bfloat16",
         "dutch_coverage": "none",
```
generate_overview_json.py CHANGED

```diff
@@ -40,7 +40,8 @@ def main():
             "model_type": results[short_name]["model_type"]
             if short_name in results and "model_type" in results[short_name]
             else "not-given",
-            "dutch_coverage": results[short_name]["dutch_coverage"]
+            "dutch_coverage": results[short_name]["dutch_coverage"]
+            if short_name in results and "dutch_coverage" in results[short_name]
             else "not-given",
         }
 
```
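The fix gives `dutch_coverage` the same guarded lookup that `model_type` already had. Below is a standalone sketch of that pattern, with hypothetical example data rather than the script's real inputs:

```python
# Standalone sketch of the guarded-lookup pattern used above; `results` and
# `short_name` are hypothetical example data, not repository contents.
results = {"mistral-7b-v0.1": {"model_type": "pretrained"}}  # no "dutch_coverage" key
short_name = "mistral-7b-v0.1"

entry = {
    # Fall back to "not-given" unless both the model and the field exist;
    # the condition is evaluated first, so the indexing never raises KeyError.
    "model_type": results[short_name]["model_type"]
    if short_name in results and "model_type" in results[short_name]
    else "not-given",
    # The commit applies the same guard to dutch_coverage.
    "dutch_coverage": results[short_name]["dutch_coverage"]
    if short_name in results and "dutch_coverage" in results[short_name]
    else "not-given",
}

assert entry == {"model_type": "pretrained", "dutch_coverage": "not-given"}
```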
|