from dataclasses import dataclass
from enum import Enum
@dataclass
class Task:
    """Describe one benchmark entry of the leaderboard.

    A ``Task`` ties together where a score is found in a results JSON file
    and the label under which it is displayed.
    """

    # Task key as it appears in the results JSON file.
    benchmark: str
    # Metric key to read for that task in the JSON file.
    metric: str
    # Name to display in the leaderboard.
    col_name: str
# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    """Benchmarks selected for this leaderboard.

    Each member's value is ``Task(benchmark, metric, col_name)``, i.e. the
    task key in the results JSON file, the metric key in that JSON file, and
    the name to display in the leaderboard.
    """

    # Template examples (commented out, not part of the enum):
    # task0 = Task("anli_r1", "acc", "ANLI")
    # task1 = Task("logiqa", "acc_norm", "LogiQA")

    # Keys ending in ":_average" read an averaged entry of the results file
    # (presumably an aggregate over sub-tasks -- confirm against the results
    # format).
    acva = Task("community|acva:_average|0", "acc_norm", "ACVA")
    alghafa = Task("community|alghafa:_average|0", "acc_norm", "AlGhafa")
    arabic_mmlu = Task("community|arabic_mmlu:_average|0", "acc_norm", "MMLU")

    # Remaining entries each point at a single task key.
    arabic_exams = Task("community|arabic_exams|0", "acc_norm", "EXAMS")
    arc_challenge_okapi_ar = Task("community|arc_challenge_okapi_ar|0", "acc_norm", "ARC Challenge")
    arc_easy_ar = Task("community|arc_easy_ar|0", "acc_norm", "ARC Easy")
    boolq_ar = Task("community|boolq_ar|0", "acc_norm", "BOOLQ")
    copa_ext_ar = Task("community|copa_ext_ar|0", "acc_norm", "COPA")
    hellaswag_okapi_ar = Task("community|hellaswag_okapi_ar|0", "acc_norm", "HELLASWAG")
    openbook_qa_ext_ar = Task("community|openbook_qa_ext_ar|0", "acc_norm", "OPENBOOK QA")
    piqa_ar = Task("community|piqa_ar|0", "acc_norm", "PIQA")
    race_ar = Task("community|race_ar|0", "acc_norm", "RACE")
    sciq_ar = Task("community|sciq_ar|0", "acc_norm", "SCIQ")
    toxigen_ar = Task("community|toxigen_ar|0", "acc_norm", "TOXIGEN")
# Few-shot setting; change this value to match your own few-shot count.
NUM_FEWSHOT = 0
# ---------------------------------------------------
# Your leaderboard name
# Earlier plain-text title, kept commented out for reference. Every line of
# the old literal is commented so that no bare text or stray triple quote is
# left behind to break the module's syntax.
# TITLE = """
# Open Arabic LLM Leaderboard
# """
# Both constants below are currently empty strings.
TITLE = """"""
BOTTOM_LOGO = """"""
# What does your leaderboard evaluate?
# Markdown introduction, assembled line by line; the empty first and last
# items give the text its leading and trailing newline.
INTRODUCTION_TEXT = "\n".join(
    (
        "",
        "🌴 The Open Arabic LLM Leaderboard : Evaluate and compare the performance of Arabic Large Language Models (LLMs).",
        'When you submit a model on the "Submit here!" page, it is automatically evaluated on a set of benchmarks.',
        "The GPU used for evaluation is operated with the support of __[Technology Innovation Institute (TII)](https://www.tii.ae/)__.",
        "The datasets used for evaluation consist of datasets that are Arabic Native like the `AlGhafa` benchmark from [TII](https://www.tii.ae/) and `ACVA` benchmark from [FreedomIntelligence](https://huggingface.co/FreedomIntelligence) to assess reasoning, language understanding, commonsense, and more.",
        "More details about the benchmarks and the evaluation process is provided on the “About” page.",
        "",
    )
)
# Which evaluations are you running? how can people reproduce what you have?
# Markdown text describing the context, model-type icons, evaluation method,
# benchmarks and result datasets. Deliberately a plain (non-f) string: the
# text contains no placeholders, and without the f prefix any literal brace
# added later cannot be misread as a replacement field.
LLM_BENCHMARKS_TEXT = """
# Context
While outstanding LLM models are being released competitively, most of them are centered on English and are familiar with the English cultural sphere. We operate the Open Arabic LLM Leaderboard (OALL), to evaluate models that reflect the characteristics of the Arabic language, culture and heritage. Through this, we hope that users can conveniently use the leaderboard, participate, and contribute to the advancement of research in the Arab region 🔥.
## Icons & Model types
🟢 : `pretrained` or `continuously pretrained`
🔶 : `fine-tuned on domain-specific datasets`
💬 : `chat models (RLHF, DPO, ORPO, ...)`
🤝 : `base merges and moerges`
If the icon is "?", it indicates that there is insufficient information about the model.
Please provide information about the model through an issue! 🤩
Note 1 : We reserve the right to correct any incorrect tags/icons after manual verification to ensure the accuracy and reliability of the leaderboard.
Note 2 ⚠️ : Some models might be widely discussed as subjects of caution by the community, implying that users should exercise restraint when using them. Models that have used the evaluation set for training to achieve a high leaderboard ranking, among others, may be selected as subjects of caution and might result in their deletion from the leaderboard.
## How it works
📈 We evaluate models using the impressive [LightEval](https://github.com/huggingface/lighteval), a unified and straightforward framework from the HuggingFace Eval Team to test and assess causal language models on a large number of different evaluation tasks.
We have set up a benchmark using datasets, most of them translated to Arabic, and validated by native Arabic speakers. We also added `AlGhafa`, a new benchmark prepared from scratch natively for Arabic, alongside the `ACVA` benchmark introduced in the [AceGPT](https://arxiv.org/abs/2309.12053) paper by [FreedomIntelligence](https://huggingface.co/FreedomIntelligence).
Find below the Native benchmarks :
- AlGhafa : Find more details [here](https://aclanthology.org/2023.arabicnlp-1.21.pdf) - (provided by [TII](https://www.tii.ae/))
- Arabic-Culture-Value-Alignment (ACVA) : Find more details [here](https://arxiv.org/pdf/2309.12053) - (provided by [FreedomIntelligence](https://huggingface.co/FreedomIntelligence))
And here find all the translated benchmarks provided by the Language evaluation team at [Technology Innovation Institute](https://www.tii.ae/) :
- `Arabic-MMLU`, `Arabic-EXAMS`, `Arabic-ARC-Challenge`, `Arabic-ARC-Easy`, `Arabic-BOOLQ`, `Arabic-COPA`, `Arabic-HELLASWAG`, `Arabic-OPENBOOK-QA`, `Arabic-PIQA`, `Arabic-RACE`, `Arabic-SCIQ`, `Arabic-TOXIGEN`. All part of the extended version of the AlGhafa benchmark (AlGhafa-T version)
To ensure a fair and unbiased assessment of the models' true capabilities, all evaluations are conducted in zero-shot settings `0-shots`. This approach eliminates any potential advantage from task-specific fine-tuning, providing a clear indication of how well the models can generalize to new tasks.
Also, given the nature of the tasks, which include multiple-choice and yes/no questions, the leaderboard primarily uses normalized log likelihood accuracy `loglikelihood_acc_norm` for all tasks. This metric was chosen for its ability to provide a clear and fair measurement of model performance across different types of questions.
Please, consider reaching out to us through the discussions tab if you are working on benchmarks for Arabic LLMs and willing to see them on this leaderboard as well. Your benchmark might change the whole game for Arabic models !
GPUs are provided by __[Technology Innovation Institute (TII)](https://www.tii.ae/)__ for the evaluations.
## Details and Logs
- Detailed numerical results in the `results` OALL dataset: https://huggingface.co/datasets/OALL/results
- Community queries and running status in the `requests` OALL dataset: https://huggingface.co/datasets/OALL/requests
## More resources
If you still have questions, you can check our FAQ [here](https://huggingface.co/spaces/OALL/Open-Arabic-LLM-Leaderboard/discussions/1)!
"""
# Markdown guidance for submitters: checks to run before submitting a model
# and what to do when a model lands in the FAILED category.
# The remote-code note names the Transformers loader flag
# `trust_remote_code`; there is no `use_remote_code` option.
EVALUATION_QUEUE_TEXT = """
## Some good practices before submitting a model
### 1) Make sure you can load your model and tokenizer using AutoClasses:
```python
from transformers import AutoConfig, AutoModel, AutoTokenizer
config = AutoConfig.from_pretrained("your model name", revision=revision)
model = AutoModel.from_pretrained("your model name", revision=revision)
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
```
If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
Note: make sure your model is public!
Note: if your model needs `trust_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
### 3) Make sure your model has an open license!
This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
### 4) Fill up your model card
When we add extra information about models to the leaderboard, it will be automatically taken from the model card
## In case of model failure
If your model is displayed in the `FAILED` category, its execution stopped.
Make sure you have followed the above steps first.
If everything is done, check you can launch the LightEval script on your model locally, using [this script](https://gist.github.com/alielfilali01/d486cfc962dca3ed4091b7c562a4377f).
"""
# Label text that goes with the citation snippet below.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
# BibTeX entries to cite: the leaderboard itself (OALL), the AlGhafa
# benchmark paper, the AceGPT paper and the LightEval framework.
# Kept as a raw string so the backslash in "\url{...}" stays literal.
CITATION_BUTTON_TEXT = r"""
@misc{OALL,
author = {Elfilali, Ali and Alobeidli, Hamza and Fourrier, Clémentine and Boussaha, Basma El Amel and Cojocaru, Ruxandra and Habib, Nathan and Hacid, Hakim},
title = {Open Arabic LLM Leaderboard},
year = {2024},
publisher = {OALL},
howpublished = "\url{https://huggingface.co/spaces/OALL/Open-Arabic-LLM-Leaderboard}"
}
@inproceedings{almazrouei-etal-2023-alghafa,
title = "{A}l{G}hafa Evaluation Benchmark for {A}rabic Language Models",
author = "Almazrouei, Ebtesam and
Cojocaru, Ruxandra and
Baldo, Michele and
Malartic, Quentin and
Alobeidli, Hamza and
Mazzotta, Daniele and
Penedo, Guilherme and
Campesan, Giulia and
Farooq, Mugariya and
Alhammadi, Maitha and
Launay, Julien and
Noune, Badreddine",
editor = "Sawaf, Hassan and
El-Beltagy, Samhaa and
Zaghouani, Wajdi and
Magdy, Walid and
Abdelali, Ahmed and
Tomeh, Nadi and
Abu Farha, Ibrahim and
Habash, Nizar and
Khalifa, Salam and
Keleg, Amr and
Haddad, Hatem and
Zitouni, Imed and
Mrini, Khalil and
Almatham, Rawan",
booktitle = "Proceedings of ArabicNLP 2023",
month = dec,
year = "2023",
address = "Singapore (Hybrid)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.arabicnlp-1.21",
doi = "10.18653/v1/2023.arabicnlp-1.21",
pages = "244--275",
abstract = "Recent advances in the space of Arabic large language models have opened up a wealth of potential practical applications. From optimal training strategies, large scale data acquisition and continuously increasing NLP resources, the Arabic LLM landscape has improved in a very short span of time, despite being plagued by training data scarcity and limited evaluation resources compared to English. In line with contributing towards this ever-growing field, we introduce AlGhafa, a new multiple-choice evaluation benchmark for Arabic LLMs. For showcasing purposes, we train a new suite of models, including a 14 billion parameter model, the largest monolingual Arabic decoder-only model to date. We use a collection of publicly available datasets, as well as a newly introduced HandMade dataset consisting of 8 billion tokens. Finally, we explore the quantitative and qualitative toxicity of several Arabic models, comparing our models to existing public Arabic LLMs.",
}
@misc{huang2023acegpt,
title={AceGPT, Localizing Large Language Models in Arabic},
author={Huang Huang and Fei Yu and Jianqing Zhu and Xuening Sun and Hao Cheng and Dingjie Song and Zhihong Chen and Abdulmohsen Alharthi and Bang An and Ziche Liu and Zhiyi Zhang and Junying Chen and Jianquan Li and Benyou Wang and Lian Zhang and Ruoyu Sun and Xiang Wan and Haizhou Li and Jinchao Xu},
year={2023},
eprint={2309.12053},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{lighteval,
author = {Fourrier, Clémentine and Habib, Nathan and Wolf, Thomas and Tunstall, Lewis},
title = {LightEval: A lightweight framework for LLM evaluation},
year = {2023},
version = {0.3.0},
url = {https://github.com/huggingface/lighteval}
}"""