# Hugging Face Space: openGPT-X/european-llm-leaderboard (status: Running)
# File size: 6,577 Bytes

import gradio as gr

import core as core
from style import CSS, T_SYMBOLS, TITLE

demo = gr.Blocks(css=CSS)
with demo:
    # Page header: the HTML title followed by a short Markdown introduction.
    gr.HTML(TITLE)
    gr.Markdown(
        # Adjacent string literals are joined at compile time, so the text
        # stays a single paragraph. A backslash continuation inside the
        # literal would instead embed the next source line's indentation in
        # the rendered text.
        "This is a (WIP) collection of multilingual evaluation results obtained using our fork of the LM-evaluation-harness (https://github.com/OpenGPTX/lm-evaluation-harness), based on https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard. "
        "Note that currently, not all benchmarks are available in all languages, results are averaged over those languages under the selected ones for which the benchmark is available.",
        elem_classes="markdown-text",
    )

    with gr.Column():
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    # Free-text model filter; the placeholder tells users to
                    # separate several queries with ';' and confirm with ENTER.
                    search_bar = gr.Textbox(
                        elem_id="search-bar",
                        show_label=True,
                        label="Search models",
                        placeholder=" 🔍 Separate multiple queries with ';' and press ENTER...",
                    )

                    # Each choice is a (display text, value) pair whose value is
                    # the type's symbol from T_SYMBOLS. The initial selection is
                    # every symbol T_SYMBOLS contains.
                    model_type_choices = [
                        (f"{caption} {T_SYMBOLS[key]}", T_SYMBOLS[key])
                        for caption, key in (
                            ("Pretrained", "pretrained"),
                            ("Chat", "chat"),
                        )
                    ]
                    model_types = gr.CheckboxGroup(
                        choices=model_type_choices,
                        value=list(T_SYMBOLS.values()),
                        label="Select model type",
                    )
                with gr.Row():
                    # Languages whose results are averaged; every entry of
                    # core.languages_list is offered and initially selected.
                    langs_bar = gr.CheckboxGroup(
                        choices=core.languages_list,
                        value=core.languages_list,
                        label="Select languages to average over",
                        elem_id="column-select",
                        interactive=True,
                        scale=6,
                    )
                    with gr.Column(scale=1):
                        # Button that clears the language selection.
                        clear = gr.ClearButton(
                            langs_bar,
                            value="Deselect all languages",
                            size="sm",
                            scale=1,
                        )
                        select = gr.Button(
                            value="Select all languages", size="sm", scale=1
                        )

                        def update_bar():
                            """Return the full language list as the new selection.

                            Only the value is returned; Gradio assigns it to the
                            output component. The selector's other settings
                            (choices, label, scale, ...) stay defined in exactly
                            one place, the ``langs_bar`` constructor above,
                            instead of being rebuilt here where they can drift
                            apart.
                            """
                            return core.languages_list

                        select.click(update_bar, inputs=[], outputs=langs_bar)

                with gr.Row():
                    # The task groups reported for the "accuracy" task type
                    # fill the selector and are all ticked initially.
                    acc_task_group_names = core.task_groups_with_task_type("accuracy")
                    shown_tasks = gr.CheckboxGroup(
                        label="Select tasks to show",
                        choices=acc_task_group_names,
                        value=acc_task_group_names,
                        elem_id="column-select",
                        scale=50,
                        interactive=True,
                    )
                    # The radio value is a boolean: False for 0-shot, True for
                    # few-shot (the default).
                    fewshot = gr.Radio(
                        label="Select evaluation type",
                        choices=[("0-Shot", False), ("Few-shot", True)],
                        value=True,
                        scale=29,
                        interactive=True,
                    )
                    # core.fix_zeroshot receives the task selection and the
                    # few-shot flag and writes its result back to the task
                    # selector: first on page load, then on every radio change.
                    for hook in (demo.load, fewshot.change):
                        hook(core.fix_zeroshot, [shown_tasks, fewshot], shown_tasks)
                    clear = gr.ClearButton(
                        shown_tasks,
                        value="Deselect all tasks",
                        size="sm",
                        scale=21,
                    )

        with gr.Tabs(elem_classes="tab-buttons") as tabs:
            # Tab 0: accuracy leaderboard table (created empty here; it is
            # filled by the event handlers registered further down).
            with gr.TabItem(
                "🏅 LLM accuracy benchmark", elem_id="llm-benchmark-tab-table-acc", id=0
            ) as acc:
                leaderboard_table = gr.Dataframe()
            # Tab 1: translation leaderboard table.
            with gr.TabItem(
                "🌐 LLM translation benchmark",
                elem_id="llm-benchmark-tab-table-misc",
                id=1,
            ) as misc:
                leaderboard_table_misc = gr.Dataframe()
            # Tab 2: plot view.
            with gr.TabItem("Plots", elem_id="llm-plot-tab", id=2) as plot:
                leaderboard_plot = gr.Plot(elem_id="plot")
            # Selecting one of the two table tabs passes that tab's id and the
            # current few-shot setting to core.update_tab_tasks; its result is
            # written to the task selector and the few-shot radio. The plot
            # tab has no select handler.
            acc.select(
                lambda x: core.update_tab_tasks(0, x),
                inputs=fewshot,
                outputs=[shown_tasks, fewshot],
            )
            misc.select(
                lambda x: core.update_tab_tasks(1, x),
                inputs=fewshot,
                outputs=[shown_tasks, fewshot],
            )
            # Each filter control refreshes both tables and the plot. For every
            # control the handlers are registered in this order: accuracy
            # table, translation table, plot.
            refresh_targets = [
                (core.update_df, leaderboard_table),
                (core.update_df, leaderboard_table_misc),
                (core.update_plot, leaderboard_plot),
            ]
            # `fn` is the name of the event method to look up on the control
            # (e.g. search_bar.submit, langs_bar.change).
            for comp, fn in [
                (search_bar, "submit"),
                (langs_bar, "change"),
                (shown_tasks, "change"),
                (fewshot, "change"),
                (model_types, "change"),
            ]:
                attach = getattr(comp, fn)
                for handler, target in refresh_targets:
                    attach(
                        handler,
                        [shown_tasks, search_bar, langs_bar, model_types, fewshot],
                        target,
                    )

    # Fill both tables and the plot once when the page loads, using the same
    # inputs as the per-control refresh handlers. The listener is called on
    # the `demo` instance instead of as `gr.Blocks.load(block=demo, ...)`, so
    # it does not depend on how Gradio attaches the listener to the class.
    for initial_fn, initial_output in [
        (core.update_df, leaderboard_table),
        (core.update_df, leaderboard_table_misc),
        (core.update_plot, leaderboard_plot),
    ]:
        demo.load(
            fn=initial_fn,
            inputs=[shown_tasks, search_bar, langs_bar, model_types, fewshot],
            outputs=initial_output,
        )

# Launch the assembled Blocks app; this runs whenever the module is executed
# or imported, since there is no __main__ guard.
demo.launch()