File size: 5,508 Bytes
fee8f50
 
 
 
 
 
a38e13d
fee8f50
 
a38e13d
fee8f50
 
 
 
a38e13d
fee8f50
 
a38e13d
 
fee8f50
 
 
a38e13d
 
24d4f12
 
 
a38e13d
 
 
fee8f50
ac7a85c
 
 
 
 
 
 
 
 
fee8f50
0822410
 
 
 
 
 
 
a38e13d
 
 
4b8f341
 
 
a38e13d
 
0822410
a38e13d
 
 
fee8f50
 
0822410
a38e13d
 
 
0822410
 
 
a38e13d
 
0822410
a38e13d
 
 
0822410
 
 
 
 
 
a551fbc
 
0822410
a38e13d
 
 
 
 
 
a551fbc
 
0822410
a38e13d
 
 
0822410
 
 
a38e13d
 
0822410
a38e13d
 
 
ac7a85c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fee8f50
 
 
 
 
 
 
 
0822410
fee8f50
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import gradio as gr
import pandas as pd
from css_html_js import custom_css

# Root Blocks container for the leaderboard page; components are attached to
# it inside the `with demo:` section further down. Styled with the CSS string
# imported from css_html_js.
demo = gr.Blocks(
    css=custom_css,
)

# Page heading, passed to gr.HTML. The flag is the pair of regional-indicator
# code points U+1F1F2 U+1F1FE (Malaysia); the previous text held the same
# bytes mis-decoded through a single-byte codepage.
TITLE = """<h1 align="center" id="space-title">🇲🇾 Malay LLM Leaderboard</h1>"""

# Markdown introduction shown under the heading. The explicit "\n" at the end
# of the first sentence, followed by the physical line break, yields a blank
# line in the string, which Markdown treats as a paragraph break.
# Emoji used: U+1F4DD (memo), U+1F917 (hugging face), U+1F4C8 (chart).
INTRODUCTION_TEXT = """
📝 The 🇲🇾 Malay LLM Leaderboard aims to track, rank and evaluate open LLMs on Malay tasks.\n
🤗 All notebooks at https://github.com/mesolitica/llm-benchmarks, feel free to submit your own score at https://huggingface.co/spaces/mesolitica/malay-llm-leaderboard/discussions with link to the notebook.

## Dataset

📈 We evaluate models based on 3 datasets,

1. BM-PT3 Paper 1, contains 54 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/BM-pt3
2. Tatabahasa, contains 349 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/tatabahasabm.tripod.com
3. Translated IndoNLI to Malay, tested on `test_expert` dataset, https://huggingface.co/datasets/mesolitica/translated-indonli
"""

# Column-name suffixes for the three few-shot settings reported per benchmark.
_SHOT_LABELS = ('0-shot', '1-shot', '3-shots')


def _leaderboard_row(model, bm_pt3=None, tatabahasa=None):
    """Build one leaderboard record as a plain dict.

    ``bm_pt3`` and ``tatabahasa`` are optional 3-tuples of scores ordered as
    0-shot, 1-shot, 3-shots. A benchmark left as ``None`` contributes no keys
    at all, so pandas fills the corresponding cells with NaN.
    """
    row = {'model': model}
    for benchmark, scores in (('BM-PT3', bm_pt3), ('Tatabahasa', tatabahasa)):
        if scores is None:
            continue
        for shot, score in zip(_SHOT_LABELS, scores):
            row[f'{benchmark} {shot}'] = score
    return row


# Leaderboard table, one row per model. Model names written as Markdown links
# stay as raw strings here. The first row carries every column, which fixes
# the column order: model, BM-PT3 x3, Tatabahasa x3.
data = pd.DataFrame([
    _leaderboard_row(
        'gpt-4-1106-preview',
        bm_pt3=(51.85185185185185, 66.66666666666666, 55.55555555555556),
        tatabahasa=(75.64469914040114, 73.63896848137536, 75.64469914040114),
    ),
    _leaderboard_row(
        'gpt-3.5-turbo-0613',
        bm_pt3=(36.53846153846153, 28.846153846153843, 24.528301886792452),
        tatabahasa=(59.530791788856305, 60.80691642651297, 63.03724928366762),
    ),
    _leaderboard_row(
        '[llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b-hf)',
        tatabahasa=(24.355300859598856, 28.08022922636103, 24.641833810888254),
    ),
    _leaderboard_row(
        '[malaysian-llama2-7b-32k](https://huggingface.co/mesolitica/llama-7b-hf-32768-fpf)',
        bm_pt3=(20.37037037037037, 20.37037037037037, 29.629629629629626),
        tatabahasa=(17.765042979942695, 24.068767908309454, 27.507163323782237),
    ),
    _leaderboard_row(
        '[malaysian-llama2-7b-32k-instructions](https://huggingface.co/mesolitica/malaysian-llama2-7b-32k-instructions)',
        bm_pt3=(35.294117647058826, 21.153846153846153, 28.30188679245283),
    ),
    _leaderboard_row(
        '[malaysian-llama2-13b-32k](https://huggingface.co/mesolitica/llama-13b-hf-32768-fpf)',
        bm_pt3=(33.33333333333333, 20.37037037037037, 31.48148148148148),
        tatabahasa=(26.07449856733524, 25.214899713467048, 24.355300859598856),
    ),
    _leaderboard_row(
        '[malaysian-llama2-13b-32k-instructions](https://huggingface.co/mesolitica/malaysian-llama2-13b-32k-instructions)',
        bm_pt3=(28.57142857142857, 12.244897959183673, 17.307692307692307),
    ),
    _leaderboard_row(
        '[mistral-7b](https://huggingface.co/mistralai/Mistral-7B-v0.1)',
        tatabahasa=(28.939828080229223, 34.38395415472779, 32.95128939828081),
    ),
    _leaderboard_row(
        '[malaysian-mistral-7b-4k](https://huggingface.co/mesolitica/mistral-7b-4096-fpf)',
        bm_pt3=(20.37037037037037, 22.22222222222222, 33.33333333333333),
        tatabahasa=(21.48997134670487, 28.939828080229223, 24.641833810888254),
    ),
    _leaderboard_row(
        '[malaysian-mistral-7b-32k](https://huggingface.co/mesolitica/mistral-7b-32768-fpf)',
        bm_pt3=(16.666666666666664, 16.666666666666664, 25.925925925925924),
        tatabahasa=(18.624641833810887, 24.355300859598856, 28.653295128939828),
    ),
    _leaderboard_row(
        '[malaysian-mistral-7b-32k-instructions](https://huggingface.co/mesolitica/malaysian-mistral-7b-32k-instructions)',
        bm_pt3=(21.568627450980394, 31.25, 28.000000000000004),
    ),
    _leaderboard_row(
        '[aisingapore/sealion3b](https://huggingface.co/aisingapore/sealion3b)',
        bm_pt3=(20.37037037037037, 25.925925925925924, 31.48148148148148),
        tatabahasa=(21.776504297994272, 21.776504297994272, 24.641833810888254),
    ),
    _leaderboard_row(
        '[aisingapore/sealion7b](https://huggingface.co/aisingapore/sealion7b)',
        bm_pt3=(20.37037037037037, 24.074074074074073, 33.33333333333333),
        tatabahasa=(25.787965616045845, 27.507163323782237, 26.07449856733524),
    ),
])

# Populate the page: components created inside the Blocks context are added
# to it in the order they are instantiated.
with demo:
    # Heading as raw HTML.
    gr.HTML(value=TITLE)
    # Introductory Markdown, tagged with a CSS class for styling.
    gr.Markdown(
        value=INTRODUCTION_TEXT,
        elem_classes="markdown-text",
    )
    # Score table; the Markdown datatype lets the link-style model names
    # render as links.
    gr.DataFrame(
        value=data,
        datatype="markdown",
    )

# Start the Gradio app.
demo.launch()