"""Gradio front end for the multilingual LLM leaderboard.

Builds the filter controls (model search, model type, languages, tasks and
zero-/few-shot selection) plus two result tables, and wires them to the
callbacks provided by the ``core`` module.
"""

import gradio as gr

import core as core
from style import CSS, LANG_SYMBOLS, T_SYMBOLS, TITLE

# Introductory text rendered below the title. Written as adjacent literals so
# that the two sentences are joined by a plain space rather than by a
# backslash continuation inside the string.
INTRO_MARKDOWN = (
    "This is a collection of multilingual evaluation results obtained using "
    "our fork of the LM-evaluation-harness "
    "(https://github.com/OpenGPTX/lm-evaluation-harness), based on V1 of the "
    "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard. "
    "Note that currently, benchmarks are available in 21 European languages "
    "(Irish, Maltese, Croatian missing)."
)


def _language_choices():
    """Return ``(label, value)`` pairs for every entry of ``core.languages_list``.

    The label is the ``LANG_SYMBOLS`` entry when one exists, otherwise the
    language identifier itself.
    """
    return [(LANG_SYMBOLS.get(lang, lang), lang) for lang in core.languages_list]


def update_bar(selected_tab):
    """Rebuild the language selector with every language ticked.

    Args:
        selected_tab: Index of the active tab; only ``0`` and ``1`` exist.

    Returns:
        A ``gr.CheckboxGroup`` update whose value is ``core.languages_list``.

    Raises:
        ValueError: If ``selected_tab`` is neither ``0`` nor ``1``.
    """
    if selected_tab not in (0, 1):
        raise ValueError(f"Unknown tab index: {selected_tab!r}")
    return gr.CheckboxGroup(
        choices=_language_choices(),
        value=core.languages_list,
        label="Select languages to average over",
        elem_id="column-select",
        interactive=True,
    )


demo = gr.Blocks(css=CSS)

with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRO_MARKDOWN, elem_classes="markdown-text")

    # Index of the tab that is currently shown (0 = accuracy, 1 = translation).
    selected_tab = gr.State(value=0)

    # NOTE(review): the container nesting below is reconstructed from the
    # order of the ``with`` statements and the ``scale`` values of the
    # components; confirm it against the deployed layout.
    with gr.Column():
        with gr.Row():
            with gr.Column():
                # Row 1: free-text model search and model-type filter.
                with gr.Row():
                    search_bar = gr.Textbox(
                        label="Search models",
                        placeholder=" 🔍 Separate multiple queries with ';' and press ENTER...",
                        show_label=True,
                        elem_id="search-bar",
                    )
                    model_types = gr.CheckboxGroup(
                        label="Select model type",
                        choices=[
                            (
                                f"Pretrained {T_SYMBOLS['pretrained']}",
                                T_SYMBOLS["pretrained"],
                            ),
                            (f"Chat {T_SYMBOLS['chat']}", T_SYMBOLS["chat"]),
                        ],
                        value=list(T_SYMBOLS.values()),
                    )

                # Row 2: language selection with select-all / deselect-all buttons.
                with gr.Row():
                    langs_bar = gr.CheckboxGroup(
                        choices=_language_choices(),
                        value=core.languages_list,
                        label="Select languages to average over",
                        elem_id="column-select",
                        interactive=True,
                        scale=6,
                    )
                    with gr.Column(scale=1):
                        clear = gr.ClearButton(
                            langs_bar,
                            value="Deselect all languages",
                            size="sm",
                            scale=1,
                        )
                        select = gr.Button(
                            value="Select all languages", size="sm", scale=1
                        )
                        select.click(
                            update_bar, inputs=[selected_tab], outputs=langs_bar
                        )

                # Row 3: task selection, evaluation type and deselect-all button.
                with gr.Row():
                    shown_tasks = gr.CheckboxGroup(
                        choices=[],
                        value=[],
                        label="Select tasks to show",
                        elem_id="column-select",
                        interactive=True,
                        scale=50,
                    )
                    fewshot = gr.Radio(
                        choices=[("0-Shot", False), ("Few-shot", True)],
                        value=True,
                        label="Select evaluation type",
                        scale=29,
                    )
                    clear = gr.ClearButton(
                        shown_tasks, value="Deselect all tasks", size="sm", scale=21
                    )

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem(
            "🏅 LLM accuracy benchmark",
            elem_id="llm-benchmark-tab-table-acc",
            id=0,
        ) as acc:
            leaderboard_table = gr.Dataframe()
        with gr.TabItem(
            "🌐 LLM translation benchmark",
            elem_id="llm-benchmark-tab-table-misc",
            id=1,
        ) as misc:
            leaderboard_table_misc = gr.Dataframe()

    # core.update_task_groups_and_fewshot takes the tab index as its first
    # input and writes back to the task list, the evaluation type, the
    # selected-tab state, the model types and the languages.
    group_inputs = [model_types, langs_bar, fewshot]
    group_outputs = [shown_tasks, fewshot, selected_tab, model_types, langs_bar]

    demo.load(
        core.update_task_groups_and_fewshot,
        [gr.State(value=0), *group_inputs],
        group_outputs,
    )
    fewshot.change(
        core.update_task_groups_and_fewshot,
        [selected_tab, *group_inputs],
        group_outputs,
    )
    # Selecting a tab passes that tab's own index as a constant state value.
    acc.select(
        core.update_task_groups_and_fewshot,
        inputs=[gr.State(value=0), *group_inputs],
        outputs=group_outputs,
    )
    misc.select(
        core.update_task_groups_and_fewshot,
        inputs=[gr.State(value=1), *group_inputs],
        outputs=group_outputs,
    )

    # Every filter change refreshes both tables through core.update_df, and
    # both tables are also filled once when the page loads.
    df_inputs = [shown_tasks, search_bar, langs_bar, model_types, fewshot]
    tables = (leaderboard_table, leaderboard_table_misc)
    refresh_triggers = (
        search_bar.submit,
        langs_bar.change,
        shown_tasks.change,
        fewshot.change,
        model_types.change,
    )
    for trigger in refresh_triggers:
        for table in tables:
            trigger(core.update_df, df_inputs, table)

    for table in tables:
        demo.load(fn=core.update_df, inputs=df_inputs, outputs=table)

demo.launch()