huseinzol05
committed on
Commit
•
fee8f50
1
Parent(s):
488cadc
added initial scores
Browse files
- .gitignore +1 -0
- README.md +1 -1
- app.py +51 -0
- css_html_js.py +111 -0
- requirements.txt +1 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
*__pycache__
|
README.md
CHANGED
@@ -9,4 +9,4 @@ app_file: app.py
|
|
9 |
pinned: false
|
10 |
---
|
11 |
|
12 |
-
|
|
|
9 |
pinned: false
|
10 |
---
|
11 |
|
12 |
+
Notebooks at https://github.com/mesolitica/llm-benchmarks
|
app.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
from css_html_js import custom_css
|
4 |
+
|
5 |
+
demo = gr.Blocks(css=custom_css)
|
6 |
+
|
7 |
+
TITLE = """<h1 align="center" id="space-title">🤗 Malay LLM Leaderboard</h1>"""
|
8 |
+
|
9 |
+
INTRODUCTION_TEXT = """
|
10 |
+
📐 The 🤗 Malay LLM Leaderboard aims to track, rank and evaluate open LLMs on Malay tasks.\n
|
11 |
+
🤗 All notebooks at https://github.com/mesolitica/llm-benchmarks, feel free to submit your own score at https://huggingface.co/spaces/mesolitica/malay-llm-leaderboard/discussions with link to the notebook.
|
12 |
+
|
13 |
+
## Dataset
|
14 |
+
|
15 |
+
📈 We evaluate models based on 4 datasets,
|
16 |
+
|
17 |
+
1. BM-PT3 Paper 1, contains 54 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/BM-pt3
|
18 |
+
2. BM Paper 1, contains 180 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/tatabahasabm.tripod.com-bm-kertas-1
|
19 |
+
3. Tatabahasa, contains 349 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/tatabahasabm.tripod.com
|
20 |
+
4. Translated IndoNLI to Malay, tested on `test_expert` dataset, https://huggingface.co/datasets/mesolitica/translated-indonli
|
21 |
+
"""
|
22 |
+
|
23 |
+
data = [
|
24 |
+
{
|
25 |
+
'model': 'gpt-3.5-turbo-0613',
|
26 |
+
'BM-PT3 0-shot (% correct)': 36.53846153846153,
|
27 |
+
'BM-PT3 1-shot (% correct)': 28.846153846153843,
|
28 |
+
'BM-PT3 3-shots (% correct)': 24.528301886792452,
|
29 |
+
},
|
30 |
+
{
|
31 |
+
'model': 'malaysian-llama2-7b-32k',
|
32 |
+
'BM-PT3 0-shot (% correct)': 20.37037037037037,
|
33 |
+
'BM-PT3 1-shot (% correct)': 16.666666666666664,
|
34 |
+
'BM-PT3 3-shots (% correct)': 27.77777777777778,
|
35 |
+
},
|
36 |
+
{
|
37 |
+
'model': 'malaysian-llama2-13b-32k',
|
38 |
+
'BM-PT3 0-shot (% correct)': 33.33333333333333,
|
39 |
+
'BM-PT3 1-shot (% correct)': 24.074074074074073,
|
40 |
+
'BM-PT3 3-shots (% correct)': 25.925925925925924,
|
41 |
+
}
|
42 |
+
]
|
43 |
+
|
44 |
+
data = pd.DataFrame(data)
|
45 |
+
|
46 |
+
with demo:
|
47 |
+
gr.HTML(TITLE)
|
48 |
+
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
49 |
+
gr.DataFrame(data)
|
50 |
+
|
51 |
+
demo.launch()
|
css_html_js.py
ADDED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
custom_css = """
|
2 |
+
|
3 |
+
.markdown-text {
|
4 |
+
font-size: 16px !important;
|
5 |
+
}
|
6 |
+
|
7 |
+
#models-to-add-text {
|
8 |
+
font-size: 18px !important;
|
9 |
+
}
|
10 |
+
|
11 |
+
#citation-button span {
|
12 |
+
font-size: 16px !important;
|
13 |
+
}
|
14 |
+
|
15 |
+
#citation-button textarea {
|
16 |
+
font-size: 16px !important;
|
17 |
+
}
|
18 |
+
|
19 |
+
#citation-button > label > button {
|
20 |
+
margin: 6px;
|
21 |
+
transform: scale(1.3);
|
22 |
+
}
|
23 |
+
|
24 |
+
#leaderboard-table {
|
25 |
+
margin-top: 15px
|
26 |
+
}
|
27 |
+
|
28 |
+
#leaderboard-table-lite {
|
29 |
+
margin-top: 15px
|
30 |
+
}
|
31 |
+
|
32 |
+
#search-bar-table-box > div:first-child {
|
33 |
+
background: none;
|
34 |
+
border: none;
|
35 |
+
}
|
36 |
+
|
37 |
+
#search-bar {
|
38 |
+
padding: 0px;
|
39 |
+
}
|
40 |
+
|
41 |
+
/* Hides the final AutoEvalColumn */
|
42 |
+
#llm-benchmark-tab-table table td:last-child,
|
43 |
+
#llm-benchmark-tab-table table th:last-child {
|
44 |
+
display: none;
|
45 |
+
}
|
46 |
+
|
47 |
+
/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
|
48 |
+
table td:first-child,
|
49 |
+
table th:first-child {
|
50 |
+
max-width: 400px;
|
51 |
+
overflow: auto;
|
52 |
+
white-space: nowrap;
|
53 |
+
}
|
54 |
+
|
55 |
+
.tab-buttons button {
|
56 |
+
font-size: 20px;
|
57 |
+
}
|
58 |
+
|
59 |
+
#scale-logo {
|
60 |
+
border-style: none !important;
|
61 |
+
box-shadow: none;
|
62 |
+
display: block;
|
63 |
+
margin-left: auto;
|
64 |
+
margin-right: auto;
|
65 |
+
max-width: 600px;
|
66 |
+
}
|
67 |
+
|
68 |
+
#scale-logo .download {
|
69 |
+
display: none;
|
70 |
+
}
|
71 |
+
#filter_type{
|
72 |
+
border: 0;
|
73 |
+
padding-left: 0;
|
74 |
+
padding-top: 0;
|
75 |
+
}
|
76 |
+
#filter_type label {
|
77 |
+
display: flex;
|
78 |
+
}
|
79 |
+
#filter_type label > span{
|
80 |
+
margin-top: var(--spacing-lg);
|
81 |
+
margin-right: 0.5em;
|
82 |
+
}
|
83 |
+
#filter_type label > .wrap{
|
84 |
+
width: 103px;
|
85 |
+
}
|
86 |
+
#filter_type label > .wrap .wrap-inner{
|
87 |
+
padding: 2px;
|
88 |
+
}
|
89 |
+
#filter_type label > .wrap .wrap-inner input{
|
90 |
+
width: 1px
|
91 |
+
}
|
92 |
+
#filter-columns-type{
|
93 |
+
border:0;
|
94 |
+
padding:0.5;
|
95 |
+
}
|
96 |
+
#filter-columns-size{
|
97 |
+
border:0;
|
98 |
+
padding:0.5;
|
99 |
+
}
|
100 |
+
#box-filter > .form{
|
101 |
+
border: 0
|
102 |
+
}
|
103 |
+
"""
|
104 |
+
|
105 |
+
get_window_url_params = """
|
106 |
+
function(url_params) {
|
107 |
+
const params = new URLSearchParams(window.location.search);
|
108 |
+
url_params = Object.fromEntries(params);
|
109 |
+
return url_params;
|
110 |
+
}
|
111 |
+
"""
|
requirements.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
pandas
|