Add search capability and language names

- app.py +70 -6
- content.py +1 -1
- css.py +13 -0
app.py
CHANGED
@@ -2,8 +2,10 @@ import os
 import json
 import glob
 from collections import defaultdict
+import pandas as pd
 import gradio as gr
 from content import *
+from css import *
 import glob
 
 ARC = "arc"
@@ -14,6 +16,42 @@ BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA]
 
 METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
 
+LANGS = 'ar,bn,ca,da,de,es,eu,fr,gu,hi,hr,hu,hy,id,it,kn,ml,mr,ne,nl,pt,ro,ru,sk,sr,sv,ta,te,uk,vi,zh'.split(',')
+
+LANG_NAME = {
+    'ar': 'Arabic',
+    'bn': 'Bengali',
+    'ca': 'Catalan',
+    'da': 'Danish',
+    'de': 'German',
+    'es': 'Spanish',
+    'eu': 'Basque',
+    'fr': 'French',
+    'gu': 'Gujarati',
+    'hi': 'Hindi',
+    'hr': 'Croatian',
+    'hu': 'Hungarian',
+    'hy': 'Armenian',
+    'id': 'Indonesian',
+    'it': 'Italian',
+    'kn': 'Kannada',
+    'ml': 'Malayalam',
+    'mr': 'Marathi',
+    'ne': 'Nepali',
+    'nl': 'Dutch',
+    'pt': 'Portuguese',
+    'ro': 'Romanian',
+    'ru': 'Russian',
+    'sk': 'Slovak',
+    'sr': 'Serbian',
+    'sv': 'Swedish',
+    'ta': 'Tamil',
+    'te': 'Telugu',
+    'uk': 'Ukrainian',
+    'vi': 'Vietnamese',
+    'zh': 'Chinese'
+}
+
 
 def collect_results():
     performance_dict = defaultdict(dict)
@@ -52,6 +90,7 @@ def collect_results():
 def get_leaderboard_df(performance_dict, pretrained_models):
     df = list()
     for (pretrained, lang), perfs in performance_dict.items():
+        lang_name = LANG_NAME[lang]
         arc_perf = perfs.get(ARC, 0.0)
         hellaswag_perf = perfs.get(HELLASWAG, 0.0)
         mmlu_perf = perfs.get(MMLU, 0.0)
@@ -60,26 +99,40 @@ def get_leaderboard_df(performance_dict, pretrained_models):
         if arc_perf * hellaswag_perf * mmlu_perf * truthfulqa_perf == 0:
             continue
         avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
-
+        notes = ' '.join([pretrained, lang_name, lang])
+        row = [pretrained, lang_name, lang, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf, notes]
         df.append(row)
+
+    df = pd.DataFrame.from_records(df, columns=COLS)
+    df = df.sort_values(by=[AVERAGE_COL], ascending=False)
+    df = df[COLS]
+
     return df
 
 
+def search_table(df, query):
+    filtered_df = df[df[NOTES_COL].str.contains(query, case=False)]
+    return filtered_df
+
+
+
 MODEL_COL = "Model"
 LANG_COL = "Language"
+CODE_COL = "Code"
 AVERAGE_COL = "Average"
 ARC_COL = "ARC (25-shot)"
 HELLASWAG_COL = "HellaSwag (10-shot)"
 MMLU_COL = "MMLU (5-shot)"
 TRUTHFULQA_COL = "TruthfulQA (0-shot)"
+NOTES_COL = "Notes"  # For search only
 
-COLS = [MODEL_COL, LANG_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
-TYPES = ["str", "str", "number", "number", "number", "number", "number"]
+COLS = [MODEL_COL, LANG_COL, CODE_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL, NOTES_COL]
+TYPES = ["str", "str", "str", "number", "number", "number", "number", "number", "str"]
 
 args = collect_results()
-
+original_df = get_leaderboard_df(*args)
 
-demo = gr.Blocks()
+demo = gr.Blocks(css=CUSTOM_CSS)
 with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
@@ -91,13 +144,24 @@ with demo:
     )
 
     leaderboard_table = gr.components.Dataframe(
-        value=
+        value=original_df,
        headers=COLS,
        datatype=TYPES,
        max_rows=5,
        elem_id="leaderboard-table",
     )
 
+    # # Dummy leaderboard for handling the case when the user uses backspace key
+    hidden_leaderboard_table_for_search = gr.components.Dataframe(
+        value=original_df, headers=COLS, datatype=TYPES, max_rows=5, visible=False
+    )
+
+    search_bar.change(
+        search_table,
+        [hidden_leaderboard_table_for_search, search_bar],
+        leaderboard_table,
+    )
+
     gr.Markdown(CREDIT, elem_classes="markdown-text")
     gr.Markdown(CITATION, elem_classes="markdown-text")
 
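The search wiring above relies on a second, invisible copy of the full table: `search_table` always filters that pristine copy, so deleting characters from the query (the backspace case the inline comment mentions) re-expands the results instead of narrowing an already-filtered view. Below is a minimal, self-contained sketch of the same pattern; the toy dataframe and names such as `hidden_table` are illustrative placeholders, not part of the commit.

```python
import gradio as gr
import pandas as pd

NOTES_COL = "Notes"  # hidden search column, as in the commit

# Toy stand-in for original_df; the real table comes from get_leaderboard_df().
original_df = pd.DataFrame(
    {
        "Model": ["model-a", "model-b"],
        "Language": ["French", "German"],
        NOTES_COL: ["model-a French fr", "model-b German de"],
    }
)

def search_table(df, query):
    # Case-insensitive substring match over the Notes column.
    return df[df[NOTES_COL].str.contains(query, case=False)]

with gr.Blocks() as demo:
    search_bar = gr.Textbox(placeholder="Search models and languages...")
    leaderboard_table = gr.Dataframe(value=original_df)
    # Hidden, never-filtered copy: every query filters this full table, so
    # shortening the query (e.g. via backspace) brings rows back.
    hidden_table = gr.Dataframe(value=original_df, visible=False)
    search_bar.change(search_table, [hidden_table, search_bar], leaderboard_table)

if __name__ == "__main__":
    demo.launch()
```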
content.py
CHANGED
@@ -3,7 +3,7 @@ TITLE = '<h1 align="center" id="space-title">Open Multilingual LLM Evaluation Le
 INTRO_TEXT = f"""
 ## About
 
-This leaderboard shows the performance of pretrained models in 29 languages
+This leaderboard shows the performance of pretrained models in 29 languages including Arabic, Armenian, Basque, Bengali, Catalan, Chinese, Croatian, Danish, Dutch, French, German, Gujarati, Hindi, Hungarian, Indonesian, Italian, Kannada, Malayalam, Marathi, Nepali, Portuguese, Romanian, Russian, Serbian, Slovak, Spanish, Swedish, Tamil, Telugu, Ukrainian, and Vietnamese on four benchmarks:
 
 - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot)
 - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot)
css.py
ADDED
@@ -0,0 +1,13 @@
+CUSTOM_CSS = """
+/* Hides the final column */
+table td:last-child,
+table th:last-child {
+    display: none;
+}
+# table td:first-child,
+# table th:first-child {
+#     max-width: 400px;
+#     overflow: auto;
+#     white-space: nowrap;
+# }
+"""
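The stylesheet pairs with the `NOTES_COL` trick in app.py: the notes column sits last in `COLS`, so hiding every table's final `<td>`/`<th>` keeps the concatenated search text out of sight while it remains in the dataframe that `search_table` filters. A small sketch of that interplay, with toy data and `HIDE_LAST_COLUMN` standing in for the commit's `CUSTOM_CSS`:

```python
import gradio as gr
import pandas as pd

# Same rule css.py ships: hide the last column of every rendered table.
HIDE_LAST_COLUMN = """
table td:last-child,
table th:last-child {
    display: none;
}
"""

# Toy row; "Notes" must be the final column for the CSS trick to target it.
df = pd.DataFrame({"Model": ["model-a"], "Average": [51.3], "Notes": ["model-a French fr"]})

with gr.Blocks(css=HIDE_LAST_COLUMN) as demo:
    # "Notes" renders last, so the browser hides it while the underlying
    # dataframe (and any search over it) still holds the data.
    gr.Dataframe(value=df)

if __name__ == "__main__":
    demo.launch()
```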