Spaces:

openGPT-X
/

european-llm-leaderboard

Running

App Files Files Community

european-llm-leaderboard / app.py

KlaudiaTH

Release version of leaderboard implementation

2b62c4c 3 months ago

raw

history blame

No virus

6.45 kB

	import gradio as gr

	import core as core
	from style import CSS, T_SYMBOLS, TITLE

	# Build the leaderboard UI: filter controls on top, result tabs below, and
	# event wiring that refreshes the tables and the plot.
	# NOTE(review): the container nesting below was reconstructed from the order
	# of the calls; confirm the exact layout against the deployed Space.
	demo = gr.Blocks(css=CSS)
	with demo:
	    gr.HTML(TITLE)
	    gr.Markdown(
	        # Adjacent literals instead of a backslash continuation inside the
	        # string, so no source indentation leaks into the displayed text.
	        "This is a (WIP) collection of multilingual evaluation results obtained "
	        "using our fork of the LM-evaluation-harness "
	        "(https://github.com/OpenGPTX/lm-evaluation-harness), based on "
	        "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard. "
	        "Note that currently, not all benchmarks are available in all languages, "
	        "results are averaged over those languages under the selected ones for "
	        "which the benchmark is available.",
	        elem_classes="markdown-text",
	    )

	    with gr.Column():
	        with gr.Row():
	            with gr.Column():
	                # Row 1: free-text model search and model-type filter.
	                with gr.Row():
	                    search_bar = gr.Textbox(
	                        label="Search models",
	                        placeholder=" 🔍 Separate multiple queries with ';' and press ENTER...",
	                        show_label=True,
	                        elem_id="search-bar",
	                    )

	                    model_types = gr.CheckboxGroup(
	                        label="Select model type",
	                        choices=[
	                            (
	                                f"Pretrained {T_SYMBOLS['pretrained']}",
	                                T_SYMBOLS["pretrained"],
	                            ),
	                            (f"Chat {T_SYMBOLS['chat']}", T_SYMBOLS["chat"]),
	                        ],
	                        # All model types are selected initially.
	                        value=list(T_SYMBOLS.values()),
	                    )

	                # Row 2: language selection plus select/deselect-all buttons.
	                with gr.Row():
	                    langs_bar = gr.CheckboxGroup(
	                        choices=core.languages_list,
	                        value=core.languages_list,
	                        label="Select languages to average over",
	                        elem_id="column-select",
	                        interactive=True,
	                        scale=6,
	                    )
	                    with gr.Column(scale=1):
	                        clear_langs = gr.ClearButton(
	                            langs_bar,
	                            value="Deselect all languages",
	                            size="sm",
	                            scale=1,
	                        )
	                        select = gr.Button(
	                            value="Select all languages", size="sm", scale=1
	                        )

	                        def update_bar():
	                            """Return a language checkbox group with every language selected.

	                            Used as the click handler of the "Select all languages"
	                            button; the returned component is sent to ``langs_bar``.
	                            """
	                            # Returned directly so the outer ``langs_bar`` name is
	                            # not shadowed by a local of the same name.
	                            return gr.CheckboxGroup(
	                                choices=core.languages_list,
	                                value=core.languages_list,
	                                label="Select languages to average over",
	                                elem_id="column-select",
	                                interactive=True,
	                            )

	                        select.click(update_bar, inputs=[], outputs=langs_bar)

	                # Row 3: task selection, evaluation type and a deselect-all button.
	                with gr.Row():
	                    acc_task_group_names = core.task_groups_with_task_type("accuracy")
	                    shown_tasks = gr.CheckboxGroup(
	                        choices=acc_task_group_names,
	                        value=acc_task_group_names,
	                        label="Select tasks to show",
	                        elem_id="column-select",
	                        interactive=True,
	                        scale=50,
	                    )
	                    fewshot = gr.Radio(
	                        choices=[("0-Shot", False), ("Few-shot", True)],
	                        value=True,  # "Few-shot" is the initial choice
	                        label="Select evaluation type",
	                        interactive=True,
	                        scale=29,
	                    )
	                    # Presumably adjusts the task selection when the evaluation
	                    # type changes; see core.fix_zeroshot to confirm.
	                    fewshot.change(
	                        core.fix_zeroshot, [shown_tasks, fewshot], shown_tasks
	                    )
	                    clear_tasks = gr.ClearButton(
	                        shown_tasks, value="Deselect all tasks", size="sm", scale=21
	                    )

	    with gr.Tabs(elem_classes="tab-buttons") as tabs:
	        with gr.TabItem(
	            "🏅 LLM accuracy benchmark", elem_id="llm-benchmark-tab-table-acc", id=0
	        ) as acc:
	            leaderboard_table = gr.Dataframe()
	        with gr.TabItem(
	            "🌐 LLM translation benchmark",
	            elem_id="llm-benchmark-tab-table-misc",
	            id=1,
	        ) as misc:
	            leaderboard_table_misc = gr.Dataframe()
	        with gr.TabItem("Plots", elem_id="llm-plot-tab", id=2) as plot:
	            leaderboard_plot = gr.Plot(elem_id="plot")

	        # Selecting a table tab passes its index and the current evaluation
	        # type to core.update_tab_tasks, which updates both controls.
	        acc.select(
	            lambda x: core.update_tab_tasks(0, x),
	            inputs=fewshot,
	            outputs=[shown_tasks, fewshot],
	        )
	        misc.select(
	            lambda x: core.update_tab_tasks(1, x),
	            inputs=fewshot,
	            outputs=[shown_tasks, fewshot],
	        )

	    # The same five controls feed every refresh callback.
	    query_inputs = [shown_tasks, search_bar, langs_bar, model_types, fewshot]

	    # (callback, output component) pairs, in the order they are registered.
	    refresh_targets = [
	        (core.update_df, leaderboard_table),
	        (core.update_df, leaderboard_table_misc),
	        (core.update_plot, leaderboard_plot),
	    ]

	    # Any change to a filter control refreshes both tables and the plot.
	    for trigger in (
	        search_bar.submit,
	        langs_bar.change,
	        shown_tasks.change,
	        fewshot.change,
	        model_types.change,
	    ):
	        for callback, output in refresh_targets:
	            trigger(callback, query_inputs, output)

	    # Fill the tables and the plot once when the page is loaded.
	    for callback, output in refresh_targets:
	        demo.load(fn=callback, inputs=query_inputs, outputs=output)

	demo.launch()