huseinzol05
commited on
Commit
β’
a38e13d
1
Parent(s):
a551fbc
improve scores
Browse files
app.py
CHANGED
@@ -4,52 +4,81 @@ from css_html_js import custom_css
|
|
4 |
|
5 |
demo = gr.Blocks(css=custom_css)
|
6 |
|
7 |
-
TITLE = """<h1 align="center" id="space-title"
|
8 |
|
9 |
INTRODUCTION_TEXT = """
|
10 |
-
π The
|
11 |
🤗 All notebooks at https://github.com/mesolitica/llm-benchmarks, feel free to submit your own score at https://huggingface.co/spaces/mesolitica/malay-llm-leaderboard/discussions with link to the notebook.
|
12 |
|
13 |
## Dataset
|
14 |
|
15 |
-
π We evaluate models based on
|
16 |
|
17 |
1. BM-PT3 Paper 1, contains 54 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/BM-pt3
|
18 |
-
2.
|
19 |
-
3.
|
20 |
-
4. Translated IndoNLI to Malay, tested on `test_expert` dataset, https://huggingface.co/datasets/mesolitica/translated-indonli
|
21 |
"""
|
22 |
|
23 |
data = [
|
24 |
{
|
25 |
'model': 'gpt-3.5-turbo-0613',
|
26 |
-
'BM-PT3 0-shot
|
27 |
-
'BM-PT3 1-shot
|
28 |
-
'BM-PT3 3-shots
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
},
|
30 |
{
|
31 |
'model': 'malaysian-llama2-7b-32k',
|
32 |
-
'BM-PT3 0-shot
|
33 |
-
'BM-PT3 1-shot
|
34 |
-
'BM-PT3 3-shots
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
},
|
36 |
{
|
37 |
'model': 'malaysian-llama2-13b-32k',
|
38 |
-
'BM-PT3 0-shot
|
39 |
-
'BM-PT3 1-shot
|
40 |
-
'BM-PT3 3-shots
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
},
|
42 |
{
|
43 |
'model': 'malaysian-mistral-7b-4k',
|
44 |
-
'BM-PT3 0-shot
|
45 |
-
'BM-PT3 1-shot
|
46 |
-
'BM-PT3 3-shots
|
|
|
|
|
|
|
47 |
},
|
48 |
{
|
49 |
'model': 'malaysian-mistral-7b-32k',
|
50 |
-
'BM-PT3 0-shot
|
51 |
-
'BM-PT3 1-shot
|
52 |
-
'BM-PT3 3-shots
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
}
|
54 |
]
|
55 |
|
|
|
4 |
|
5 |
demo = gr.Blocks(css=custom_css)
|
6 |
|
7 |
+
TITLE = """<h1 align="center" id="space-title">🇲🇾 Malay LLM Leaderboard</h1>"""
|
8 |
|
9 |
INTRODUCTION_TEXT = """
|
10 |
+
🏆 The 🇲🇾 Malay LLM Leaderboard aims to track, rank and evaluate open LLMs on Malay tasks.\n
|
11 |
🤗 All notebooks at https://github.com/mesolitica/llm-benchmarks, feel free to submit your own score at https://huggingface.co/spaces/mesolitica/malay-llm-leaderboard/discussions with link to the notebook.
|
12 |
|
13 |
## Dataset
|
14 |
|
15 |
+
π We evaluate models based on 3 datasets,
|
16 |
|
17 |
1. BM-PT3 Paper 1, contains 54 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/BM-pt3
|
18 |
+
2. Tatabahasa, contains 349 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/tatabahasabm.tripod.com
|
19 |
+
3. Translated IndoNLI to Malay, tested on `test_expert` dataset, https://huggingface.co/datasets/mesolitica/translated-indonli
|
|
|
20 |
"""
|
21 |
|
22 |
data = [
|
23 |
{
|
24 |
'model': 'gpt-3.5-turbo-0613',
|
25 |
+
'BM-PT3 0-shot': 36.53846153846153,
|
26 |
+
'BM-PT3 1-shot': 28.846153846153843,
|
27 |
+
'BM-PT3 3-shots': 24.528301886792452,
|
28 |
+
'Tatabahasa 0-shot': 59.530791788856305,
|
29 |
+
'Tatabahasa 1-shot': 60.80691642651297,
|
30 |
+
'Tatabahasa 3-shots': 63.03724928366762,
|
31 |
+
},
|
32 |
+
{
|
33 |
+
'model': 'gpt-4-1106-preview',
|
34 |
+
'Tatabahasa 0-shot': 75.64469914040114,
|
35 |
+
'Tatabahasa 1-shot': 73.63896848137536,
|
36 |
+
'Tatabahasa 3-shots': 75.64469914040114,
|
37 |
},
|
38 |
{
|
39 |
'model': 'malaysian-llama2-7b-32k',
|
40 |
+
'BM-PT3 0-shot': 20.37037037037037,
|
41 |
+
'BM-PT3 1-shot': 20.37037037037037,
|
42 |
+
'BM-PT3 3-shots': 29.629629629629626,
|
43 |
+
},
|
44 |
+
{
|
45 |
+
'model': 'malaysian-llama2-7b-32k-instructions',
|
46 |
+
'BM-PT3 0-shot': 35.294117647058826,
|
47 |
+
'BM-PT3 1-shot': 21.153846153846153,
|
48 |
+
'BM-PT3 3-shots': 28.30188679245283,
|
49 |
},
|
50 |
{
|
51 |
'model': 'malaysian-llama2-13b-32k',
|
52 |
+
'BM-PT3 0-shot': 33.33333333333333,
|
53 |
+
'BM-PT3 1-shot': 20.37037037037037,
|
54 |
+
'BM-PT3 3-shots': 31.48148148148148,
|
55 |
+
},
|
56 |
+
{
|
57 |
+
'model': 'malaysian-llama2-13b-32k-instructions',
|
58 |
+
'BM-PT3 0-shot': 28.57142857142857,
|
59 |
+
'BM-PT3 1-shot': 12.244897959183673,
|
60 |
+
'BM-PT3 3-shots': 17.307692307692307,
|
61 |
},
|
62 |
{
|
63 |
'model': 'malaysian-mistral-7b-4k',
|
64 |
+
'BM-PT3 0-shot': 20.37037037037037,
|
65 |
+
'BM-PT3 1-shot': 22.22222222222222,
|
66 |
+
'BM-PT3 3-shots': 33.33333333333333,
|
67 |
+
'Tatabahasa 0-shot': 21.48997134670487,
|
68 |
+
'Tatabahasa 1-shot': 28.939828080229223,
|
69 |
+
'Tatabahasa 3-shots': 24.641833810888254,
|
70 |
},
|
71 |
{
|
72 |
'model': 'malaysian-mistral-7b-32k',
|
73 |
+
'BM-PT3 0-shot': 16.666666666666664,
|
74 |
+
'BM-PT3 1-shot': 16.666666666666664,
|
75 |
+
'BM-PT3 3-shots': 25.925925925925924,
|
76 |
+
},
|
77 |
+
{
|
78 |
+
'model': 'malaysian-mistral-7b-32k-instructions',
|
79 |
+
'BM-PT3 0-shot': 21.568627450980394,
|
80 |
+
'BM-PT3 1-shot': 31.25,
|
81 |
+
'BM-PT3 3-shots': 28.000000000000004,
|
82 |
}
|
83 |
]
|
84 |
|