from dataclasses import dataclass
from enum import Enum

from src.envs import REPO_ID


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    task1 = Task("PeKA", "acc", "PeKA*")
    task2 = Task("PersBETS", "acc", "PersBETS*")
    task3 = Task("khayyam_challenge", "acc", "Khayyam Challenge")
    task4 = Task("parsinlu_mc", "acc", "ParsiNLU MCQA")
    task5 = Task("parsinlu_nli", "acc", "ParsiNLU NLI")
    task6 = Task("parsinlu_qqp", "acc", "ParsiNLU QQP")
    # task7 = Task("persian_ARC", "acc", "Persian ARC")


NUM_FEWSHOT = 0  # Change with your few shot
# ---------------------------------------------------


# Your leaderboard name
TITLE = f""" """

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
The Persian LLM Leaderboard is designed to be a challenging benchmark and to provide a reliable evaluation of LLMs in the Persian language.

Note: This is a demo version of the leaderboard. Two new benchmarks are introduced: *PeKA* and *PersBETS*, challenging the models' native knowledge along with their linguistic skills and their level of bias, ethics, and trustworthiness.
**These datasets are not yet public, but they will be uploaded to Hugging Face along with a detailed paper explaining the data and the performance of relevant models.**

Note: **We plan to release an evaluation framework soon in which the details and methods of evaluation are specified.**
"""

# Which evaluations are you running? How can people reproduce what you have?
LLM_BENCHMARKS_TEXT = f"""
## How it works

## Reproducibility
To reproduce our results, here are the commands you can run:

"""

EVALUATION_QUEUE_TEXT = """
Right now, the models added **are not automatically evaluated**. We may support automatic evaluation in the future on our own clusters.
An evaluation framework will be available in the future to help reproduce the results.

## Don't forget to read the FAQ and the About tabs for more information!

## First steps before submitting a model

### 1) Make sure you can load your model and tokenizer using AutoClasses:
```python
from transformers import AutoConfig, AutoModel, AutoTokenizer
config = AutoConfig.from_pretrained("your model name", revision=revision)
model = AutoModel.from_pretrained("your model name", revision=revision)
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
```
If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.

Note: make sure your model is public!

### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
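If your checkpoint is still stored as PyTorch `.bin` files, one minimal way to convert it (a sketch assuming a standard `transformers` checkpoint; `"your model name"` and `"path-to-converted-model"` are placeholders) is to re-save it with safetensors serialization and upload the resulting files to your repository:
```python
# Sketch: re-save an existing checkpoint with safetensors serialization.
# "your model name" and "path-to-converted-model" are placeholders.
from transformers import AutoModel

model = AutoModel.from_pretrained("your model name")
model.save_pretrained("path-to-converted-model", safe_serialization=True)
```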
### 3) Make sure your model has an open license!
This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗

### 4) Fill out your model card
When we add extra information about models to the leaderboard, it will be automatically taken from the model card.

### 5) Select the correct precision
Not all models are converted properly from `float16` to `bfloat16`, and selecting the wrong precision can sometimes cause evaluation errors (loading a `bf16` model in `fp16` can sometimes generate NaNs, depending on the weight range).

## In case of model failure
If your model is displayed in the `FAILED` category, its execution stopped.
Make sure you have followed the above steps first.
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
"""