huseinzol05
committed on
Commit
•
7963463
1
Parent(s):
958587e
fix app
Browse files
app.py
CHANGED
@@ -1,152 +1,158 @@
|
|
1 |
-
import gradio as gr
|
2 |
-
import pandas as pd
|
3 |
-
from css_html_js import custom_css
|
4 |
|
5 |
-
TITLE = """<h1 align="center" id="space-title">🇲🇾 Malay LLM Leaderboard</h1>"""
|
6 |
|
7 |
-
INTRODUCTION_TEXT = """
|
8 |
-
📐 The 🇲🇾 Malay LLM Leaderboard aims to track, rank and evaluate open LLMs on Malay tasks. All notebooks at https://github.com/mesolitica/llm-benchmarks, feel free to submit your own score at https://huggingface.co/spaces/mesolitica/malay-llm-leaderboard/discussions with link to the notebook.
|
9 |
|
10 |
-
## Dataset
|
11 |
|
12 |
-
π We evaluate models based on 3 datasets,
|
13 |
|
14 |
-
1. BM-PT3 Paper 1, contains 54 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/BM-pt3
|
15 |
-
- This test is for 15 years old Malaysia student, it is about reading comprehension and general knowledge for malay language.
|
16 |
-
2. Tatabahasa, contains 349 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/tatabahasabm.tripod.com
|
17 |
-
- This test is general test for malay grammar.
|
18 |
-
3. Translated IndoNLI to Malay, tested on `test_expert` dataset, https://huggingface.co/datasets/mesolitica/translated-indonli
|
19 |
-
- This test is general test to language reasoning.
|
20 |
-
4. HumanEval, https://github.com/openai/human-eval
|
21 |
-
- This test is for programming language understanding.
|
22 |
-
"""
|
23 |
|
24 |
-
close_source = [
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
]
|
53 |
|
54 |
-
open_source = [
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
]
|
143 |
|
144 |
-
data = pd.DataFrame(close_source + open_source)
|
145 |
|
146 |
-
demo = gr.Blocks(css=custom_css)
|
147 |
-
with demo:
|
148 |
-
|
149 |
-
|
150 |
-
|
|
|
|
|
151 |
|
|
|
|
|
|
|
|
|
152 |
demo.launch()
|
|
|
1 |
+
# import gradio as gr
|
2 |
+
# import pandas as pd
|
3 |
+
# from css_html_js import custom_css
|
4 |
|
5 |
+
# TITLE = """<h1 align="center" id="space-title">🇲🇾 Malay LLM Leaderboard</h1>"""
|
6 |
|
7 |
+
# INTRODUCTION_TEXT = """
|
8 |
+
# 📐 The 🇲🇾 Malay LLM Leaderboard aims to track, rank and evaluate open LLMs on Malay tasks. All notebooks at https://github.com/mesolitica/llm-benchmarks, feel free to submit your own score at https://huggingface.co/spaces/mesolitica/malay-llm-leaderboard/discussions with link to the notebook.
|
9 |
|
10 |
+
# ## Dataset
|
11 |
|
12 |
+
# π We evaluate models based on 3 datasets,
|
13 |
|
14 |
+
# 1. BM-PT3 Paper 1, contains 54 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/BM-pt3
|
15 |
+
# - This test is for 15 years old Malaysia student, it is about reading comprehension and general knowledge for malay language.
|
16 |
+
# 2. Tatabahasa, contains 349 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/tatabahasabm.tripod.com
|
17 |
+
# - This test is general test for malay grammar.
|
18 |
+
# 3. Translated IndoNLI to Malay, tested on `test_expert` dataset, https://huggingface.co/datasets/mesolitica/translated-indonli
|
19 |
+
# - This test is general test to language reasoning.
|
20 |
+
# 4. HumanEval, https://github.com/openai/human-eval
|
21 |
+
# - This test is for programming language understanding.
|
22 |
+
# """
|
23 |
|
24 |
+
# close_source = [
|
25 |
+
# {
|
26 |
+
# 'model': 'gpt-4-1106-preview',
|
27 |
+
# 'BM-PT3 0-shot': 51.85185185185185,
|
28 |
+
# 'BM-PT3 1-shot': 66.66666666666666,
|
29 |
+
# 'BM-PT3 3-shots': 55.55555555555556,
|
30 |
+
# 'Tatabahasa 0-shot': 75.64469914040114,
|
31 |
+
# 'Tatabahasa 1-shot': 73.63896848137536,
|
32 |
+
# 'Tatabahasa 3-shots': 75.64469914040114,
|
33 |
+
# },
|
34 |
+
# {
|
35 |
+
# 'model': 'gpt-3.5-turbo-0613',
|
36 |
+
# 'BM-PT3 0-shot': 36.53846153846153,
|
37 |
+
# 'BM-PT3 1-shot': 28.846153846153843,
|
38 |
+
# 'BM-PT3 3-shots': 24.528301886792452,
|
39 |
+
# 'Tatabahasa 0-shot': 59.530791788856305,
|
40 |
+
# 'Tatabahasa 1-shot': 60.80691642651297,
|
41 |
+
# 'Tatabahasa 3-shots': 63.03724928366762,
|
42 |
+
# },
|
43 |
+
# {
|
44 |
+
# 'model': 'Antrophic Claude 2',
|
45 |
+
# 'Tatabahasa 0-shot': 61,
|
46 |
+
# 'Tatabahasa 3-shots': 57.8,
|
47 |
+
# },
|
48 |
+
# {
|
49 |
+
# 'model': 'Antrophic Claude 1',
|
50 |
+
# 'Tatabahasa 3-shots': 67,
|
51 |
+
# },
|
52 |
+
# ]
|
53 |
|
54 |
+
# open_source = [
|
55 |
+
# {
|
56 |
+
# 'model': '[llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b-hf)',
|
57 |
+
# 'Tatabahasa 0-shot': 24.355300859598856,
|
58 |
+
# 'Tatabahasa 1-shot': 28.08022922636103,
|
59 |
+
# 'Tatabahasa 3-shots': 24.641833810888254,
|
60 |
+
# },
|
61 |
+
# {
|
62 |
+
# 'model': '[malaysian-llama2-7b-32k](https://huggingface.co/mesolitica/llama-7b-hf-32768-fpf)',
|
63 |
+
# 'BM-PT3 0-shot': 20.37037037037037,
|
64 |
+
# 'BM-PT3 1-shot': 20.37037037037037,
|
65 |
+
# 'BM-PT3 3-shots': 29.629629629629626,
|
66 |
+
# 'Tatabahasa 0-shot': 17.765042979942695,
|
67 |
+
# 'Tatabahasa 1-shot': 24.068767908309454,
|
68 |
+
# 'Tatabahasa 3-shots': 27.507163323782237,
|
69 |
+
# },
|
70 |
+
# {
|
71 |
+
# 'model': '[malaysian-llama2-7b-32k-instructions](https://huggingface.co/mesolitica/malaysian-llama2-7b-32k-instructions)',
|
72 |
+
# 'BM-PT3 0-shot': 35.294117647058826,
|
73 |
+
# 'BM-PT3 1-shot': 21.153846153846153,
|
74 |
+
# 'BM-PT3 3-shots': 28.30188679245283,
|
75 |
+
# },
|
76 |
+
# {
|
77 |
+
# 'model': '[malaysian-llama2-13b-32k](https://huggingface.co/mesolitica/llama-13b-hf-32768-fpf)',
|
78 |
+
# 'BM-PT3 0-shot': 33.33333333333333,
|
79 |
+
# 'BM-PT3 1-shot': 20.37037037037037,
|
80 |
+
# 'BM-PT3 3-shots': 31.48148148148148,
|
81 |
+
# 'Tatabahasa 0-shot': 26.07449856733524,
|
82 |
+
# 'Tatabahasa 1-shot': 25.214899713467048,
|
83 |
+
# 'Tatabahasa 3-shots': 24.355300859598856,
|
84 |
+
# },
|
85 |
+
# {
|
86 |
+
# 'model': '[malaysian-llama2-13b-32k-instructions](https://huggingface.co/mesolitica/malaysian-llama2-13b-32k-instructions)',
|
87 |
+
# 'BM-PT3 0-shot': 28.57142857142857,
|
88 |
+
# 'BM-PT3 1-shot': 12.244897959183673,
|
89 |
+
# 'BM-PT3 3-shots': 17.307692307692307,
|
90 |
+
# },
|
91 |
+
# {
|
92 |
+
# 'model': '[mistral-7b](https://huggingface.co/mistralai/Mistral-7B-v0.1)',
|
93 |
+
# 'Tatabahasa 0-shot': 28.939828080229223,
|
94 |
+
# 'Tatabahasa 1-shot': 34.38395415472779,
|
95 |
+
# 'Tatabahasa 3-shots': 32.95128939828081,
|
96 |
+
# },
|
97 |
+
# {
|
98 |
+
# 'model': '[malaysian-mistral-7b-4k](https://huggingface.co/mesolitica/mistral-7b-4096-fpf)',
|
99 |
+
# 'BM-PT3 0-shot': 20.37037037037037,
|
100 |
+
# 'BM-PT3 1-shot': 22.22222222222222,
|
101 |
+
# 'BM-PT3 3-shots': 33.33333333333333,
|
102 |
+
# 'Tatabahasa 0-shot': 21.48997134670487,
|
103 |
+
# 'Tatabahasa 1-shot': 28.939828080229223,
|
104 |
+
# 'Tatabahasa 3-shots': 24.641833810888254,
|
105 |
+
# },
|
106 |
+
# {
|
107 |
+
# 'model': '[malaysian-mistral-7b-32k](https://huggingface.co/mesolitica/mistral-7b-32768-fpf)',
|
108 |
+
# 'BM-PT3 0-shot': 16.666666666666664,
|
109 |
+
# 'BM-PT3 1-shot': 16.666666666666664,
|
110 |
+
# 'BM-PT3 3-shots': 25.925925925925924,
|
111 |
+
# 'Tatabahasa 0-shot': 18.624641833810887,
|
112 |
+
# 'Tatabahasa 1-shot': 24.355300859598856,
|
113 |
+
# 'Tatabahasa 3-shots': 28.653295128939828,
|
114 |
+
# },
|
115 |
+
# {
|
116 |
+
# 'model': '[malaysian-mistral-7b-32k-instructions](https://huggingface.co/mesolitica/malaysian-mistral-7b-32k-instructions)',
|
117 |
+
# 'BM-PT3 0-shot': 35.18518518518518,
|
118 |
+
# 'BM-PT3 1-shot': 33.33333333333333,
|
119 |
+
# 'BM-PT3 3-shots': 37.03703703703704,
|
120 |
+
# 'Tatabahasa 0-shot': 55.014326647564474,
|
121 |
+
# 'Tatabahasa 1-shot': 42.693409742120345,
|
122 |
+
# 'Tatabahasa 3-shots': 33.33333333333333,
|
123 |
+
# },
|
124 |
+
# {
|
125 |
+
# 'model': '[aisingapore/sealion3b](https://huggingface.co/aisingapore/sealion3b)',
|
126 |
+
# 'BM-PT3 0-shot': 20.37037037037037,
|
127 |
+
# 'BM-PT3 1-shot': 25.925925925925924,
|
128 |
+
# 'BM-PT3 3-shots': 31.48148148148148,
|
129 |
+
# 'Tatabahasa 0-shot': 21.776504297994272,
|
130 |
+
# 'Tatabahasa 1-shot': 21.776504297994272,
|
131 |
+
# 'Tatabahasa 3-shots': 24.641833810888254,
|
132 |
+
# },
|
133 |
+
# {
|
134 |
+
# 'model': '[aisingapore/sealion7b](https://huggingface.co/aisingapore/sealion7b)',
|
135 |
+
# 'BM-PT3 0-shot': 20.37037037037037,
|
136 |
+
# 'BM-PT3 1-shot': 24.074074074074073,
|
137 |
+
# 'BM-PT3 3-shots': 33.33333333333333,
|
138 |
+
# 'Tatabahasa 0-shot': 25.787965616045845,
|
139 |
+
# 'Tatabahasa 1-shot': 27.507163323782237,
|
140 |
+
# 'Tatabahasa 3-shots': 26.07449856733524,
|
141 |
+
# }
|
142 |
+
# ]
|
143 |
|
144 |
+
# data = pd.DataFrame(close_source + open_source)
|
145 |
|
146 |
+
# demo = gr.Blocks(css=custom_css)
|
147 |
+
# with demo:
|
148 |
+
# gr.HTML(TITLE)
|
149 |
+
# gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
150 |
+
# gr.DataFrame(data, datatype = 'markdown')
|
151 |
+
|
152 |
+
# demo.launch()
|
153 |
|
154 |
+
import gradio as gr
|
155 |
+
demo = gr.Blocks()
|
156 |
+
with demo:
|
157 |
+
gr.HTML('helo')
|
158 |
demo.launch()
|