Update app.py to be compatible with the new logs
Browse files
app.py
CHANGED
@@ -6,10 +6,10 @@ import gradio as gr
|
|
6 |
from content import *
|
7 |
import glob
|
8 |
|
9 |
-
ARC = "
|
10 |
HELLASWAG = "hellaswag"
|
11 |
MMLU = "mmlu"
|
12 |
-
TRUTHFULQA = "truthfulqa
|
13 |
BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA]
|
14 |
|
15 |
METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
|
@@ -39,18 +39,8 @@ def collect_results():
|
|
39 |
pretrained_models.add(pretrained)
|
40 |
|
41 |
for lang_task, perfs in results.items():
|
42 |
-
|
43 |
-
|
44 |
-
task = ARC
|
45 |
-
elif lang_task.startswith('hellaswag_'):
|
46 |
-
_, lang = lang_task.split('_')
|
47 |
-
task = HELLASWAG
|
48 |
-
elif lang_task.startswith('mmlu_'):
|
49 |
-
_, lang = lang_task.split('_')
|
50 |
-
task = MMLU
|
51 |
-
elif lang_task.startswith('truthfulqa_') and lang_task.endswith('_mc'):
|
52 |
-
lang = lang_task.split('_')[1]
|
53 |
-
task = TRUTHFULQA
|
54 |
|
55 |
if lang and task:
|
56 |
metric = METRICS[BENCHMARKS.index(task)]
|
|
|
6 |
from content import *
|
7 |
import glob
|
8 |
|
9 |
+
ARC = "arc"
|
10 |
HELLASWAG = "hellaswag"
|
11 |
MMLU = "mmlu"
|
12 |
+
TRUTHFULQA = "truthfulqa"
|
13 |
BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA]
|
14 |
|
15 |
METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
|
|
|
39 |
pretrained_models.add(pretrained)
|
40 |
|
41 |
for lang_task, perfs in results.items():
|
42 |
+
task, lang = lang_task.split('_')
|
43 |
+
assert task in BENCHMARKS
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
|
45 |
if lang and task:
|
46 |
metric = METRICS[BENCHMARKS.index(task)]
|