from dataclasses import dataclass
from enum import Enum
from src.envs import REPO_ID


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard 
    task1 = Task("PeKA", "acc", "PeKA*")
    task2 = Task("PersBETS", "acc", "PersBETS*")
    task3 = Task("khayyam_challenge", "acc", "Khayyam Challenge")
    task4 = Task("parsinlu_mc", "acc", "ParsiNLU MCQA")
    task5 = Task("parsinlu_nli", "acc", "ParsiNLU NLI")
    task6 = Task("parsinlu_qqp", "acc", "ParsiNLU QQP")
    # task7 = Task("persian_ARC", "acc", "Persian ARC")

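# Hypothetical shape of a results-file entry for the tasks above (illustrative only;
# the exact schema depends on your evaluation setup). The leaderboard reads each
# Task's benchmark as the task key and its metric as the metric key:
#
#   "results": {
#       "khayyam_challenge": {"acc": 0.512},
#       "parsinlu_mc": {"acc": 0.487}
#   }
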
NUM_FEWSHOT = 0  # Change to match your few-shot setting
# ---------------------------------------------------



# Your leaderboard name
TITLE = f"""
<img src="https://huggingface.co/spaces/{REPO_ID}/resolve/main/banner_green.png" style="width:70%;display:block;margin-left:auto;margin-right:auto">
"""


# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
The Persian LLM Leaderboard is designed to be a challenging benchmark and to provide a reliable evaluation of LLMs in the Persian language.

Note: This is a demo version of the leaderboard. Two new benchmarks are introduced: *PeKA* and *PersBETS*, which challenge the models' native knowledge and linguistic skills as well as their level of bias, ethics, and trustworthiness. **These datasets are not yet public, but they will be uploaded to Hugging Face along with a detailed paper explaining the data and the performance of relevant models.**

Note: **We plan to release an evaluation framework soon that specifies the details and methods of evaluation.**
"""

# Which evaluations are you running? how can people reproduce what you have?
LLM_BENCHMARKS_TEXT = f"""
## How it works

## Reproducibility
To reproduce our results, here are the commands you can run:

"""

EVALUATION_QUEUE_TEXT = """

Right now, added models **are not automatically evaluated**. We may support automatic evaluation on our own clusters in the future.
An evaluation framework will be available in the future to help reproduce the results.

## Don't forget to read the FAQ and the About tabs for more information!

## First steps before submitting a model

### 1) Make sure you can load your model and tokenizer using AutoClasses:
```python
from transformers import AutoConfig, AutoModel, AutoTokenizer

revision = "main"  # or the specific branch/commit you want evaluated
config = AutoConfig.from_pretrained("your model name", revision=revision)
model = AutoModel.from_pretrained("your model name", revision=revision)
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
```
If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.

Note: make sure your model is public!

### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
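
A minimal sketch of one way to do the conversion with `transformers` (assuming your checkpoint already loads; model names and paths are placeholders):
```python
from transformers import AutoModel

model = AutoModel.from_pretrained("your model name")  # loads the existing .bin weights
model.save_pretrained("path/to/output", safe_serialization=True)  # writes model.safetensors
# or push the converted weights back to the Hub:
# model.push_to_hub("your model name", safe_serialization=True)
```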

### 3) Make sure your model has an open license!
This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗

### 4) Fill up your model card
When we add extra information about models to the leaderboard, it will be automatically taken from the model card.

## In case of model failure
If your model is displayed in the `FAILED` category, its execution stopped.
Make sure you have followed the above steps first.

### 5) Select the correct precision
Not all models are converted properly from `float16` to `bfloat16`, and selecting the wrong precision can sometimes cause evaluation errors (as loading a `bf16` model in `fp16` can sometimes generate NaNs, depending on the weight range).
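
As a rough sanity check before submitting (assuming a causal LM; adjust the Auto class and dtype to your model), you can verify that the weights stay finite in the precision you plan to select:
```python
import torch
from transformers import AutoModelForCausalLM

# Load in the precision you intend to request on the leaderboard.
model = AutoModelForCausalLM.from_pretrained("your model name", torch_dtype=torch.float16)

# NaN/inf weights here usually mean the precision does not match how the model was trained.
all_finite = all(torch.isfinite(p).all() for p in model.parameters())
print("all weights finite in fp16:", all_finite)
```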


"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
"""