KlaudiaTH committed on
Commit 2b62c4c • 1 Parent(s): 6ee7d57

Release version of leaderboard implementation

.gitattributes ADDED
@@ -0,0 +1,36 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ small_merged_data.xlsx filter=lfs diff=lfs merge=lfs -text
.github/workflows/check_large_files-action.yml ADDED
@@ -0,0 +1,16 @@
+ name: Check file size
+ on: # or directly `on: [push]` to run the action on every push on any branch
+   pull_request:
+     branches: [main]
+
+   # to run this workflow manually from the Actions tab
+   workflow_dispatch:
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+       - name: Check large files
+         uses: ActionsDesk/lfs-warning@v2.0
+         with:
+           filesizelimit: 10485760 # this is 10MB so we can sync to HF Spaces
.github/workflows/push_to_hfspace-action.yml ADDED
@@ -0,0 +1,21 @@
+ name: Sync to Hugging Face hub
+ on:
+   push:
+     branches: [main]
+
+   # to run this workflow manually from the Actions tab
+   workflow_dispatch:
+
+ jobs:
+   sync-to-hub:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v3
+         with:
+           token: ${{ secrets.GITHUB_TOKEN }}
+           fetch-depth: 0
+           lfs: true
+       - name: Push to hub
+         env:
+           HF_TOKEN: ${{ secrets.HF_TOKEN }}
+         run: git push https://gptxuser:$HF_TOKEN@huggingface.co/spaces/openGPT-X/leaderboard main
.gitignore ADDED
@@ -0,0 +1,2 @@
+ .vscode/
+ __pycache__/
README.md CHANGED
@@ -1 +1,49 @@
- # leaderboard
+ # New data model
+
+ The new model is constructed by reading the individual JSON files in data/new_eval, combining them
+ into a single simple format, and creating one file per model from the combined dataframe.
+
+ For new eval runs that have to be appended, we first determine the model associated with the JSON file
+ produced by the eval harness, select the corresponding model file to append to, and find the rows of the
+ JSON file that are unique (a unique configuration of model name, language, task group and few-shot setting); if there are any, they are appended.
+
+
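For illustration, the append step described above might look roughly like the following minimal sketch (not part of this commit; the function name, file layout, and JSON orientation are assumptions, while the key columns mirror those used in `core.py`):

```python
import pandas as pd

# Key columns that define a unique result configuration (names taken from core.py).
KEY_COLS = ["Model_Name", "Language", "Task_Group", "Few_Shot"]

def append_new_results(model_file: str, new_rows: pd.DataFrame) -> None:
    """Append only rows whose key configuration is not already in the per-model file."""
    existing = pd.read_json(model_file)
    # Mark rows of new_rows that have no matching key configuration in the existing file.
    merged = new_rows.merge(existing[KEY_COLS], on=KEY_COLS, how="left", indicator=True)
    unique_rows = merged[merged["_merge"] == "left_only"].drop(columns="_merge")
    if len(unique_rows) > 0:  # append only if there is something new
        pd.concat([existing, unique_rows]).to_json(model_file, orient="records")
```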
+ ---
+ title: Leaderboard
+ emoji: 👍
+ colorFrom: blue
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 4.19.2
+ app_file: app.py
+ pinned: false
+ license: unknown
+ ---
+
+ # Introduction
+
+ This is the OpenGPT-X multilingual leaderboard source code repository.
+ The leaderboard aims to provide an overview of LLM performance across various languages.
+ The basic task set consists of MMLU, ARC, HellaSwag, GSM8K, TruthfulQA and Belebele.
+ To make the results comparable to the Open LLM Leaderboard (https://huggingface.co/open-llm-leaderboard), we selected the former five tasks based on our internal machine translations of the English base tasks, in addition to Belebele, a high-quality multilingual benchmark by Meta.
+
+ # Usage
+
+ The hosted leaderboard can be found at https://huggingface.co/spaces/openGPT-X/leaderboard.
+ In order to extend its functionality, please create a PR.
+
+ # Adding new tasks
+
+ In order to add new evaluation tasks, proceed as follows:
+
+ 1. Add task information to `TASK_INFO` in `src/data.py`. It should be a dict mapping the task display name to the metric to be shown, as well as a dict containing mappings from two-letter language codes to the corresponding lm-eval-harness task selection string. See existing task information for reference (a hypothetical entry is sketched below).
+ 2. Add evaluation results as detailed below.
+
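For step 1, a hypothetical `TASK_INFO` entry could look like the sketch below; `src/data.py` is not part of this commit, so the key names and harness task strings shown here are assumptions based on the description above:

```python
# Hypothetical entry: display name -> metric to show, plus a mapping from
# two-letter language codes to lm-eval-harness task selection strings.
TASK_INFO = {
    "HellaSwag": {
        "metric": "acc_norm",  # assumed metric name
        "languages": {
            "de": "hellaswag_de",  # assumed harness task strings
            "fr": "hellaswag_fr",
        },
    },
}
```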
+ # Adding new models
+
+ It is possible to change the display name of a particular model.
+ Simply add an entry to `_MODEL_NAMES` in `src/data.py` (see the sketch below).
+
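A hypothetical `_MODEL_NAMES` entry might look like this (again an assumption, since `src/data.py` is not shown in this commit):

```python
# Maps a model identifier to the display name shown on the leaderboard (assumed direction).
_MODEL_NAMES = {
    "meta-llama/Llama-2-7b-hf": "Llama 2 7B",
}
```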
+ # Adding evaluation results
+
+ Copy the `.json` output generated by the lm-eval-harness into `data`.
app.py ADDED
@@ -0,0 +1,160 @@
+ import gradio as gr
+
+ import core as core
+ from style import CSS, T_SYMBOLS, TITLE
+
+ demo = gr.Blocks(css=CSS)
+ with demo:
+     gr.HTML(TITLE)
+     gr.Markdown(
+         "This is a (WIP) collection of multilingual evaluation results obtained using our fork of the LM-evaluation-harness (https://github.com/OpenGPTX/lm-evaluation-harness), based on https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard. \
+         Note that currently not all benchmarks are available in all languages; results are averaged over those of the selected languages for which the benchmark is available.",
+         elem_classes="markdown-text",
+     )
+
+     with gr.Column():
+         with gr.Row():
+             with gr.Column():
+                 with gr.Row():
+                     search_bar = gr.Textbox(
+                         label="Search models",
+                         placeholder=" 🔍 Separate multiple queries with ';' and press ENTER...",
+                         show_label=True,
+                         elem_id="search-bar",
+                     )
+
+                 model_types = gr.CheckboxGroup(
+                     label="Select model type",
+                     choices=[
+                         (
+                             f"Pretrained {T_SYMBOLS['pretrained']}",
+                             T_SYMBOLS["pretrained"],
+                         ),
+                         (f"Chat {T_SYMBOLS['chat']}", T_SYMBOLS["chat"]),
+                     ],
+                     value=list(T_SYMBOLS.values()),
+                 )
+         with gr.Row():
+             langs_bar = gr.CheckboxGroup(
+                 choices=core.languages_list,
+                 value=core.languages_list,
+                 label="Select languages to average over",
+                 elem_id="column-select",
+                 interactive=True,
+                 scale=6,
+             )
+             with gr.Column(scale=1):
+                 clear = gr.ClearButton(
+                     langs_bar,
+                     value="Deselect all languages",
+                     size="sm",
+                     scale=1,
+                 )
+                 select = gr.Button(
+                     value="Select all languages", size="sm", scale=1
+                 )
+
+                 def update_bar():
+                     langs_bar = gr.CheckboxGroup(
+                         choices=core.languages_list,
+                         value=core.languages_list,
+                         label="Select languages to average over",
+                         elem_id="column-select",
+                         interactive=True,
+                     )
+                     return langs_bar
+
+                 select.click(update_bar, inputs=[], outputs=langs_bar)
+
+         with gr.Row():
+             acc_task_group_names = core.task_groups_with_task_type("accuracy")
+             shown_tasks = gr.CheckboxGroup(
+                 choices=acc_task_group_names,
+                 value=acc_task_group_names,
+                 label="Select tasks to show",
+                 elem_id="column-select",
+                 interactive=True,
+                 scale=50,
+             )
+             fewshot = gr.Radio(
+                 choices=[("0-Shot", False), ("Few-shot", True)],
+                 value=True,
+                 label="Select evaluation type",
+                 interactive=True,
+                 scale=29,
+             )
+             fewshot.change(
+                 core.fix_zeroshot, [shown_tasks, fewshot], shown_tasks
+             )
+             clear = gr.ClearButton(
+                 shown_tasks, value="Deselect all tasks", size="sm", scale=21
+             )
+
+     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         with gr.TabItem(
+             "🏅 LLM accuracy benchmark", elem_id="llm-benchmark-tab-table-acc", id=0
+         ) as acc:
+             leaderboard_table = gr.Dataframe()
+         with gr.TabItem(
+             "🌐 LLM translation benchmark",
+             elem_id="llm-benchmark-tab-table-misc",
+             id=1,
+         ) as misc:
+             leaderboard_table_misc = gr.Dataframe()
+         with gr.TabItem("Plots", elem_id="llm-plot-tab", id=2) as plot:
+             leaderboard_plot = gr.Plot(elem_id="plot")
+     acc.select(
+         lambda x: core.update_tab_tasks(0, x),
+         inputs=fewshot,
+         outputs=[shown_tasks, fewshot],
+     )
+     misc.select(
+         lambda x: core.update_tab_tasks(1, x),
+         inputs=fewshot,
+         outputs=[shown_tasks, fewshot],
+     )
+     for comp, fn in [
+         (search_bar, "submit"),
+         (langs_bar, "change"),
+         (shown_tasks, "change"),
+         (fewshot, "change"),
+         (model_types, "change"),
+     ]:
+         getattr(comp, fn)(
+             core.update_df,
+             [shown_tasks, search_bar, langs_bar, model_types, fewshot],
+             leaderboard_table,
+         )
+         getattr(comp, fn)(
+             core.update_df,
+             [shown_tasks, search_bar, langs_bar, model_types, fewshot],
+             leaderboard_table_misc,
+         )
+         getattr(comp, fn)(
+             core.update_plot,
+             [shown_tasks, search_bar, langs_bar, model_types, fewshot],
+             leaderboard_plot,
+         )
+
+     gr.Blocks.load(
+         block=demo,
+         fn=core.update_df,
+         inputs=[shown_tasks, search_bar, langs_bar, model_types, fewshot],
+         outputs=leaderboard_table,
+     )
+
+     gr.Blocks.load(
+         block=demo,
+         fn=core.update_df,
+         inputs=[shown_tasks, search_bar, langs_bar, model_types, fewshot],
+         outputs=leaderboard_table_misc,
+     )
+
+     gr.Blocks.load(
+         block=demo,
+         fn=core.update_plot,
+         inputs=[shown_tasks, search_bar, langs_bar, model_types, fewshot],
+         outputs=leaderboard_plot,
+     )
+
+ demo.launch()
core.py ADDED
@@ -0,0 +1,235 @@
+ import itertools
+ import os
+
+ import gradio as gr
+ import numpy as np
+ import pandas as pd
+ import plotly.express as px
+ from datasets import load_dataset
+
+ import style
+
+ TAB_STATE = 0  # FIXME
+ GSM8K_TASK_GROUP_NAME = "GSM8K"  # FIXME
+
+
+ def init():
+     global repo_id, config_name, split_name, hidden_df, task_group_names_list, task_group_type_dict, task_groups_shots_dict, languages_list, model_type_dict
+
+     repo_id = os.getenv("OGX_LEADERBOARD_DATASET_NAME")
+     config_name = os.getenv("OGX_LEADERBOARD_DATASET_CONFIG")
+     split_name = os.getenv("OGX_LEADERBOARD_DATASET_SPLIT")
+
+     dataset = load_dataset(repo_id, config_name, split=split_name)
+     hidden_df = dataset.to_pandas()
+
+     task_group_names_list = hidden_df["Task_Group"].unique().tolist()
+     task_group_type_df = hidden_df[["Task_Group", "Task_Type"]].drop_duplicates()
+     task_group_type_dict = task_group_type_df.set_index("Task_Group")["Task_Type"].to_dict()
+     task_groups_shots_df = hidden_df[hidden_df["Few_Shot"] == True][["Task_Group", "Number_Shots"]].drop_duplicates()
+     task_groups_shots_dict = task_groups_shots_df.set_index("Task_Group")["Number_Shots"].to_dict()
+     languages_list = hidden_df["Language"].drop_duplicates().str.upper().tolist()
+     model_type_df = hidden_df[["Model_Name", "Model_Type"]].drop_duplicates()
+     model_type_dict = model_type_df.set_index("Model_Name")["Model_Type"].to_dict()
+
+     hidden_df = hidden_df.pivot_table(
+         columns=["Task_Group", "Few_Shot", "Language"],
+         index=["Model_Name"],
+         values="Value",
+         dropna=False,
+     ).reset_index(inplace=False)
+
+     hidden_df["Type"] = hidden_df["Model_Name"].apply(lambda x: style.T_SYMBOLS[model_type_dict[x]])
+
+
+ def sort_cols(df: pd.DataFrame, fewshot: bool = False) -> pd.DataFrame:
+     task_cols = get_task_columns(df)
+     if fewshot:
+         renamer = {col: f"{col} ({task_groups_shots_dict[col]}-shot)" for col in task_cols if col in task_groups_shots_dict}
+         df.rename(columns=renamer, inplace=True)
+         task_cols = renamer.values()
+     return df.reindex(["Type", "Model_Name", "Average"] + sorted(task_cols), axis=1)
+
+
+ def get_task_columns(df: pd.DataFrame) -> list:
+     cols = list(df.columns)
+     cols.remove("Model_Name")
+     cols.remove("Average")
+     cols.remove("Type")
+     return cols
+
+
+ def get_models(df: pd.DataFrame) -> np.ndarray:
+     return df["Model_Name"].unique()
+
+
+ def filter_type(df: pd.DataFrame, model_types: list[str]) -> pd.DataFrame:
+     """Keep only rows for which model type is in list of types"""
+     return df[df["Type"].isin(model_types)]
+
+
+ def search_model(df: pd.DataFrame, query: str) -> pd.DataFrame:
+     """Keep only rows for which model name matches search query"""
+     query = query.replace(";", "|")
+     return df[df["Model_Name"].str.contains(query, case=False)]
+
+
+ def aggregate_langs(df: pd.DataFrame, tasks: list, langs: list):
+     """Aggregates results over langs for each task in tasks.
+     If a language does not exist for a task, the aggregate for
+     that task will be shown as NaN.
+     """
+
+     langs_lower = [item.lower() for item in langs]
+     df.columns = ["_".join(filter(None, col)) for col in df.columns]
+     colset = set(df.columns)
+     for t in tasks:
+         cols = [f"{a}_{b}" for a, b in itertools.product([t], langs_lower)]
+         if set(cols).issubset(colset):
+             df.loc[:, t] = df[cols].mean(axis=1, skipna=False)
+         else:
+             df.loc[:, t] = np.nan
+     df.loc[:, "Average"] = df[tasks].mean(axis=1)
+     return df[["Type", "Model_Name", "Average"] + tasks]
+
+
+ def select_shots(df: pd.DataFrame, fewshot: bool = False):
+     cols = [col for col in df.columns if col[1] == fewshot]
+     # Move model name and type icon to the end
+     cols.append(("Model_Name", "", ""))
+     cols.append(("Type", "", ""))
+     return df[cols].droplevel(level=1, axis="columns")
+
+
+ def update_df(
+     tasks: list[str],
+     model_query: str,
+     langs: list[str],
+     model_types: list[str],
+     fewshot: bool = False,
+     format: bool = True,
+ ) -> pd.DataFrame:
+     """Return a filtered dataframe according to selected models, tasks and
+     languages. The format flag controls whether the output dataframe should
+     be formatted to two significant figures.
+     """
+     # keep only selected shots
+     df = select_shots(hidden_df, fewshot)
+
+     # aggregate results over languages per task
+     df = aggregate_langs(df, tasks, langs)
+
+     # filter models by search bar and model type
+     df = search_model(df, model_query)
+     df = filter_type(df, model_types)
+
+     if format:
+         return sort_cols(df, fewshot).style.format(precision=2, decimal=".")
+     else:
+         return sort_cols(df, fewshot)
+
+
+ def make_plot(df: pd.DataFrame):
+     df.columns = df.loc["Model_Name"]
+     df = df.drop("Model_Name")
+     df = df.reset_index(names="task")
+     if len(df.columns) > 2:
+         fig = px.line(data_frame=df, x="task", y=df.columns, markers=True, width=1200)
+     else:
+         fig = px.bar(data_frame=df, x="task", y=df.columns[-1], width=1200)
+     fig.update_xaxes(type="category")
+     return fig
+
+
+ def update_plot(
+     tasks: list[str],
+     model_query: str,
+     langs: list[str],
+     model_types: list[str],
+     fewshot: bool = False,
+ ):
+     df = update_df(tasks, model_query, langs, model_types, fewshot, False).transpose()
+     plot = make_plot(df)
+     return plot
+
+
+ def fix_zeroshot(tasks: list[str | int | float], fewshot: bool = False):
+     global TAB_STATE
+     selected_task_type = get_selected_task_type(TAB_STATE)
+     choices = task_groups_with_task_type(selected_task_type)
+     if not fewshot:
+         try:
+             choices.remove(GSM8K_TASK_GROUP_NAME)
+         except ValueError:
+             pass
+         value = [v for v in tasks if v in choices]
+     else:
+         if TAB_STATE == 0:
+             value = [v for v in tasks if v in choices] + [GSM8K_TASK_GROUP_NAME]
+         elif TAB_STATE == 1:
+             value = [v for v in tasks if v in choices]
+     shown_tasks = gr.CheckboxGroup(
+         choices=choices,
+         value=value,
+         label="Select tasks to show",
+         elem_id="column-select",
+         interactive=True,
+         scale=50,
+     )
+     return shown_tasks
+
+
+ def update_tab_tasks(id: int, fewshot: bool = False):
+     # when the tab is changed, update the TAB_STATE accordingly
+     global TAB_STATE
+     TAB_STATE = id
+     selected_task_type = get_selected_task_type(TAB_STATE)
+     choices = task_groups_with_task_type(selected_task_type)
+     if not fewshot:
+         try:
+             choices.remove(GSM8K_TASK_GROUP_NAME)
+         except ValueError:
+             pass
+     values = choices.copy()
+     shown_tasks = gr.CheckboxGroup(
+         choices=choices,
+         value=values,
+         label="Select tasks to show",
+         elem_id="column-select",
+         interactive=True,
+         scale=50,
+     )
+     if id == 0:
+         # switching to accuracy tab, default to fewshot
+         fewshot = gr.Radio(
+             choices=[("0-Shot", False), ("Few-shot", True)],
+             value=True,
+             label="Select evaluation type",
+             interactive=True,
+             scale=29,
+         )
+     elif id == 1:
+         # switching to translation tab, default to 0-shot and disable selection
+         fewshot = gr.Radio(
+             choices=[("0-Shot", False), ("Few-shot", True)],
+             value=False,
+             label="Select evaluation type",
+             interactive=False,
+             scale=29,
+         )
+     return [shown_tasks, fewshot]
+
+
+ def get_selected_task_type(task_type_id):
+     task_types = {0: "accuracy", 1: "misc"}
+     selected_task_type = task_types[task_type_id]
+     return selected_task_type
+
+
+ def task_groups_with_task_type(selected_task_type):
+     choices = [task_group_name for task_group_name, task_type in task_group_type_dict.items() if task_type == selected_task_type]
+
+     return choices
+
+
+ init()
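Note that `core.init()` runs at import time and reads the dataset location from environment variables, so running the app locally presumably requires setting them first. A minimal sketch (the variable names come from `core.py`; the values are placeholders):

```python
import os

# Variable names taken from core.init(); the values here are hypothetical.
os.environ["OGX_LEADERBOARD_DATASET_NAME"] = "openGPT-X/some-results-dataset"
os.environ["OGX_LEADERBOARD_DATASET_CONFIG"] = "default"
os.environ["OGX_LEADERBOARD_DATASET_SPLIT"] = "train"

import app  # importing app builds the Blocks UI and calls demo.launch()
```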
pyproject.toml ADDED
@@ -0,0 +1,2 @@
+ [tool.black]
+ line-length = 250
requirements.txt ADDED
@@ -0,0 +1,20 @@
+ APScheduler==3.10.1
+ black==23.11.0
+ click==8.1.3
+ datasets==2.14.5
+ gradio==4.19.2
+ gradio_client==0.10.1
+ huggingface-hub>=0.18.0
+ markdown-it-py==2.2.0
+ MarkupSafe==2.1.2
+ matplotlib==3.7.1
+ numpy==1.24.2
+ pandas==2.0.0
+ plotly==5.14.1
+ python-dateutil==2.8.2
+ requests==2.28.2
+ semantic-version==2.10.0
+ tqdm==4.65.0
+ transformers==4.35.2
+ tokenizers>=0.15.0
+ openpyxl>=3.1.2,<4.0.0
style.py ADDED
@@ -0,0 +1,16 @@
+ TITLE = """<h1 align="center" id="space-title">OpenGPT-X Multilingual LLM Leaderboard</h1>"""
+ CSS = """
+ #plot {
+     height: 512px;
+     display: flex;
+     justify-content: center;
+     align-items: center;
+ }
+ .modebar {
+     display: none !important;
+ }
+ """
+ T_SYMBOLS = {
+     "pretrained": "🟢",
+     "chat": "💬",
+ }