"""Gradio app for the 🇲🇾 Malay LLM Leaderboard (Hugging Face Space).

Renders a static leaderboard of LLM scores on Malay-language benchmarks
(BM-PT3 Paper 1, Tatabahasa, HumanEval) as a pandas DataFrame inside a
Gradio Blocks UI.
"""

import gradio as gr
import pandas as pd

from css_html_js import custom_css

TITLE = """

🇲🇾 Malay LLM Leaderboard

"""

# NOTE: newlines reconstructed so the "## Dataset" header and the numbered
# list actually render as markdown; the text itself is unchanged.
INTRODUCTION_TEXT = """
📐 The 🇲🇾 Malay LLM Leaderboard aims to track, rank and evaluate open LLMs on Malay tasks. All notebooks at https://github.com/mesolitica/llm-benchmarks, feel free to submit your own score at https://huggingface.co/spaces/mesolitica/malay-llm-leaderboard/discussions with link to the notebook.

## Dataset 📈

We evaluate models based on 3 datasets,

1. BM-PT3 Paper 1, contains 54 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/BM-pt3
- This test is for 15 years old Malaysia student, it is about reading comprehension and general knowledge for malay language.
2. Tatabahasa, contains 349 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/tatabahasabm.tripod.com
- This test is general test for malay grammar.
3. HumanEval, https://github.com/openai/human-eval
- This test is for programming language understanding.
"""

# Scores reported by third parties that we could not reproduce ourselves.
# NOTE(review): currently unused — not merged into `data` below; kept so the
# numbers are not lost. Spelling fixed: "Antrophic" -> "Anthropic".
not_verify = [
    {
        'model': 'Anthropic Claude 2',
        'Tatabahasa 0-shot': 61,
        'Tatabahasa 3-shots': 57.8,
    },
    {
        'model': 'Anthropic Claude 1',
        'Tatabahasa 3-shots': 67,
    },
]

# Closed-source (API-only) models; scores are accuracy percentages.
close_source = [
    {
        'model': 'gpt-4-1106-preview',
        'BM-PT3 0-shot': 51.85185185185185,
        'BM-PT3 1-shot': 66.66666666666666,
        'BM-PT3 3-shots': 55.55555555555556,
        'Tatabahasa 0-shot': 75.64469914040114,
        'Tatabahasa 1-shot': 73.63896848137536,
        'Tatabahasa 3-shots': 75.64469914040114,
    },
    {
        'model': 'gpt-3.5-turbo-0613',
        'BM-PT3 0-shot': 36.53846153846153,
        'BM-PT3 1-shot': 28.846153846153843,
        'BM-PT3 3-shots': 24.528301886792452,
        'Tatabahasa 0-shot': 59.530791788856305,
        'Tatabahasa 1-shot': 60.80691642651297,
        'Tatabahasa 3-shots': 63.03724928366762,
    },
]

# Open-weights models; the 'model' field is a markdown link rendered by the
# DataFrame's datatype='markdown' below. Missing keys become NaN in the table.
open_source = [
    {
        'model': '[llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b-hf)',
        'Tatabahasa 0-shot': 24.355300859598856,
        'Tatabahasa 1-shot': 28.08022922636103,
        'Tatabahasa 3-shots': 24.641833810888254,
    },
    {
        'model': '[malaysian-llama2-7b-32k](https://huggingface.co/mesolitica/llama-7b-hf-32768-fpf)',
        'BM-PT3 0-shot': 20.37037037037037,
        'BM-PT3 1-shot': 20.37037037037037,
        'BM-PT3 3-shots': 29.629629629629626,
        'Tatabahasa 0-shot': 17.765042979942695,
        'Tatabahasa 1-shot': 24.068767908309454,
        'Tatabahasa 3-shots': 27.507163323782237,
    },
    {
        'model': '[malaysian-llama2-7b-32k-instructions](https://huggingface.co/mesolitica/malaysian-llama2-7b-32k-instructions-v2)',
        'BM-PT3 0-shot': 33.33333333333333,
        'BM-PT3 1-shot': 37.03703703703704,
        'BM-PT3 3-shots': 35.18518518518518,
        'Tatabahasa 0-shot': 54.72779369627507,
        'Tatabahasa 1-shot': 48.42406876790831,
        'Tatabahasa 3-shots': 41.833810888252145,
    },
    {
        'model': '[malaysian-llama2-13b-32k](https://huggingface.co/mesolitica/llama-13b-hf-32768-fpf)',
        'BM-PT3 0-shot': 33.33333333333333,
        'BM-PT3 1-shot': 20.37037037037037,
        'BM-PT3 3-shots': 31.48148148148148,
        'Tatabahasa 0-shot': 26.07449856733524,
        'Tatabahasa 1-shot': 25.214899713467048,
        'Tatabahasa 3-shots': 24.355300859598856,
    },
    {
        'model': '[malaysian-llama2-13b-32k-instructions](https://huggingface.co/mesolitica/malaysian-llama2-13b-32k-instructions)',
        'BM-PT3 0-shot': 28.57142857142857,
        'BM-PT3 1-shot': 12.244897959183673,
        'BM-PT3 3-shots': 17.307692307692307,
    },
    {
        'model': '[mistral-7b](https://huggingface.co/mistralai/Mistral-7B-v0.1)',
        'Tatabahasa 0-shot': 28.939828080229223,
        'Tatabahasa 1-shot': 34.38395415472779,
        'Tatabahasa 3-shots': 32.95128939828081,
    },
    {
        'model': '[malaysian-mistral-7b-4k](https://huggingface.co/mesolitica/mistral-7b-4096-fpf)',
        'BM-PT3 0-shot': 20.37037037037037,
        'BM-PT3 1-shot': 22.22222222222222,
        'BM-PT3 3-shots': 33.33333333333333,
        'Tatabahasa 0-shot': 21.48997134670487,
        'Tatabahasa 1-shot': 28.939828080229223,
        'Tatabahasa 3-shots': 24.641833810888254,
    },
    {
        'model': '[malaysian-mistral-7b-32k](https://huggingface.co/mesolitica/mistral-7b-32768-fpf)',
        'BM-PT3 0-shot': 16.666666666666664,
        'BM-PT3 1-shot': 16.666666666666664,
        'BM-PT3 3-shots': 25.925925925925924,
        'Tatabahasa 0-shot': 18.624641833810887,
        'Tatabahasa 1-shot': 24.355300859598856,
        'Tatabahasa 3-shots': 28.653295128939828,
    },
    {
        'model': '[malaysian-mistral-7b-32k-instructions](https://huggingface.co/mesolitica/malaysian-mistral-7b-32k-instructions)',
        'BM-PT3 0-shot': 40.74074074074074,
        'BM-PT3 1-shot': 31.48148148148148,
        'BM-PT3 3-shots': 24.074074074074073,
        'Tatabahasa 0-shot': 57.879656160458445,
        'Tatabahasa 1-shot': 49.28366762177651,
        'Tatabahasa 3-shots': 53.86819484240688,
    },
    {
        'model': '[aisingapore/sealion3b](https://huggingface.co/aisingapore/sealion3b)',
        'BM-PT3 0-shot': 20.37037037037037,
        'BM-PT3 1-shot': 25.925925925925924,
        'BM-PT3 3-shots': 31.48148148148148,
        'Tatabahasa 0-shot': 21.776504297994272,
        'Tatabahasa 1-shot': 21.776504297994272,
        'Tatabahasa 3-shots': 24.641833810888254,
    },
    {
        'model': '[aisingapore/sealion7b](https://huggingface.co/aisingapore/sealion7b)',
        'BM-PT3 0-shot': 20.37037037037037,
        'BM-PT3 1-shot': 24.074074074074073,
        'BM-PT3 3-shots': 33.33333333333333,
        'Tatabahasa 0-shot': 25.787965616045845,
        'Tatabahasa 1-shot': 27.507163323782237,
        'Tatabahasa 3-shots': 26.07449856733524,
    },
]

# One row per model; columns are the union of all score keys (NaN where a
# model was not evaluated on a benchmark).
data = pd.DataFrame(close_source + open_source)

demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    # datatype='markdown' so the model-name links render as hyperlinks.
    gr.DataFrame(data, datatype='markdown')

# Launched at import time on purpose: Hugging Face Spaces executes this file
# directly, so no __main__ guard is used.
demo.launch()