Spaces:

openGPT-X
/

european-llm-leaderboard

Running

App Files Files Community

european-llm-leaderboard / core.py

KlaudiaTH

Fix: Don't display Belebele in few-shot

5e9ed15 3 months ago

raw

history blame

No virus

8.38 kB

	import itertools
	import os

	import gradio as gr
	import numpy as np
	import pandas as pd
	import plotly.express as px
	from datasets import load_dataset

	import style

	# Index of the currently selected tab. Written by update_tab_tasks() and read
	# by fix_zeroshot(); get_selected_task_type() maps 0 -> "accuracy", 1 -> "misc".
	TAB_STATE = 0 # FIXME
	# Task group removed from the selectable tasks when zero-shot is selected.
	GSM8K_TASK_GROUP_NAME = "GSM8K" # FIXME
	# Task group removed from the selectable tasks when few-shot is selected.
	BELEBELE_TASK_GROUP_NAME = "BELEBELE" # FIXME


	def init():
	    """Load the leaderboard dataset and populate the module-level lookup tables.

	    Reads the dataset location from the environment variables
	    ``OGX_LEADERBOARD_DATASET_NAME``, ``OGX_LEADERBOARD_DATASET_CONFIG`` and
	    ``OGX_LEADERBOARD_DATASET_SPLIT``, loads it, and sets these globals:

	    * ``task_group_names_list``: unique values of the ``Task_Group`` column.
	    * ``task_group_type_dict``: ``Task_Group`` -> ``Task_Type``.
	    * ``task_groups_shots_dict``: ``Task_Group`` -> ``Number_Shots``, built
	      from the rows where ``Few_Shot`` is true.
	    * ``languages_list``: unique ``Language`` values, upper-cased.
	    * ``model_type_dict``: ``Model_Name`` -> ``Model_Type``.
	    * ``hidden_df``: one row per model, with ``Value`` pivoted into
	      ``(Task_Group, Few_Shot, Language)`` columns plus a ``Type`` column
	      holding the symbol from ``style.T_SYMBOLS``.

	    Raises:
	        RuntimeError: if the dataset name or split variable is unset or empty.
	    """
	    global repo_id, config_name, split_name, hidden_df
	    global task_group_names_list, task_group_type_dict, task_groups_shots_dict
	    global languages_list, model_type_dict

	    repo_id = os.getenv("OGX_LEADERBOARD_DATASET_NAME")
	    config_name = os.getenv("OGX_LEADERBOARD_DATASET_CONFIG")
	    split_name = os.getenv("OGX_LEADERBOARD_DATASET_SPLIT")

	    # Fail early with a readable message. Without a dataset name the load
	    # cannot work, and without a split the loader does not hand back a single
	    # dataset to convert with to_pandas(). The config is left optional.
	    required = (
	        ("OGX_LEADERBOARD_DATASET_NAME", repo_id),
	        ("OGX_LEADERBOARD_DATASET_SPLIT", split_name),
	    )
	    missing = [name for name, value in required if not value]
	    if missing:
	        raise RuntimeError(
	            "Missing required environment variable(s): " + ", ".join(missing)
	        )

	    dataset = load_dataset(repo_id, config_name, split=split_name)
	    hidden_df = dataset.to_pandas()

	    task_group_names_list = hidden_df["Task_Group"].unique().tolist()
	    task_group_type_df = hidden_df[["Task_Group", "Task_Type"]].drop_duplicates()
	    task_group_type_dict = task_group_type_df.set_index("Task_Group")["Task_Type"].to_dict()
	    # Element-wise comparison on a Series; "is True" would not work here.
	    few_shot_rows = hidden_df[hidden_df["Few_Shot"] == True]  # noqa: E712
	    task_groups_shots_df = few_shot_rows[["Task_Group", "Number_Shots"]].drop_duplicates()
	    task_groups_shots_dict = task_groups_shots_df.set_index("Task_Group")["Number_Shots"].to_dict()
	    languages_list = hidden_df["Language"].drop_duplicates().str.upper().tolist()
	    model_type_df = hidden_df[["Model_Name", "Model_Type"]].drop_duplicates()
	    model_type_dict = model_type_df.set_index("Model_Name")["Model_Type"].to_dict()

	    # Wide format: one row per model, one column per (task, few-shot, language).
	    hidden_df = hidden_df.pivot_table(
	        columns=["Task_Group", "Few_Shot", "Language"],
	        index=["Model_Name"],
	        values="Value",
	        dropna=False,
	    ).reset_index()

	    hidden_df["Type"] = hidden_df["Model_Name"].apply(lambda x: style.T_SYMBOLS[model_type_dict[x]])


	def sort_cols(df: pd.DataFrame, fewshot: bool = False) -> pd.DataFrame:
	    """Return ``df`` with ``Type``, ``Model_Name``, ``Average`` first and the task columns sorted.

	    Args:
	        df: frame containing ``Type``, ``Model_Name``, ``Average`` and one
	            column per task.
	        fewshot: when true, every task column that has an entry in
	            ``task_groups_shots_dict`` is renamed to ``"<task> (<n>-shot)"``;
	            task columns without an entry are left out of the result.

	    Returns:
	        A new frame; ``df`` itself is not modified.
	    """
	    task_cols = get_task_columns(df)
	    if fewshot:
	        renamer = {
	            col: f"{col} ({task_groups_shots_dict[col]}-shot)"
	            for col in task_cols
	            if col in task_groups_shots_dict
	        }
	        # Rename into a new frame: the argument is typically a filtered slice,
	        # and renaming it in place would mutate the caller's object.
	        df = df.rename(columns=renamer)
	        task_cols = list(renamer.values())
	    return df.reindex(["Type", "Model_Name", "Average"] + sorted(task_cols), axis=1)


	def get_task_columns(df: pd.DataFrame) -> list:
	    """Return the column labels of ``df`` without ``Model_Name``, ``Average`` and ``Type``.

	    The remaining labels keep their original order.

	    Raises:
	        ValueError: if any of the three metadata columns is absent.
	    """
	    columns = list(df.columns)
	    for metadata_column in ("Model_Name", "Average", "Type"):
	        columns.remove(metadata_column)
	    return columns


	def get_models(df: pd.DataFrame) -> "np.ndarray":
	    """Return the distinct values of the ``Model_Name`` column in order of first appearance.

	    The result is whatever ``Series.unique()`` yields: a NumPy array for
	    ordinary object columns (pandas may return an extension array for
	    extension dtypes).
	    """
	    return df["Model_Name"].unique()


	def filter_type(df: pd.DataFrame, model_types: list[str]) -> pd.DataFrame:
	    """Return the rows of ``df`` whose ``Type`` value is one of ``model_types``."""
	    type_is_selected = df["Type"].isin(model_types)
	    return df[type_is_selected]


	def search_model(df: pd.DataFrame, query: str) -> pd.DataFrame:
	    """Return the rows of ``df`` whose ``Model_Name`` matches the search query.

	    Args:
	        df: frame with a ``Model_Name`` column.
	        query: one or more search terms separated by ``;``. Terms are matched
	            case-insensitively as regular expressions, and a row is kept when
	            any term matches. Surrounding whitespace and empty terms (for
	            example from a trailing ``;``) are ignored; a query with no
	            terms keeps every row that has a model name.

	    Returns:
	        The matching rows.
	    """
	    terms = (term.strip() for term in query.split(";"))
	    # A bare "|" is regex alternation; an escaped "\|" would only match a
	    # literal pipe character and never combine the terms.
	    pattern = "|".join(term for term in terms if term)
	    # na=False keeps the mask strictly boolean if a model name is missing.
	    matches = df["Model_Name"].str.contains(pattern, case=False, na=False)
	    return df[matches]


	def aggregate_langs(df: pd.DataFrame, tasks: list, langs: list) -> pd.DataFrame:
	    """Average each task's scores over the selected languages.

	    The column labels of ``df`` are tuples; their non-empty parts are joined
	    with ``"_"``, so a task/language pair becomes ``"<task>_<language>"``.

	    Args:
	        df: frame with tuple column labels, including ``Model_Name`` and
	            ``Type``.
	        tasks: task names to aggregate.
	        langs: language codes; they are lower-cased before matching columns.

	    Returns:
	        A new frame with the columns ``Type``, ``Model_Name``, ``Average``
	        followed by one column per task. A task is NaN for every row when any
	        selected language has no column for it, and NaN for a single row when
	        any of that row's language scores is NaN. ``Average`` is the mean of
	        the task columns, skipping NaN. ``df`` itself is not modified.
	    """
	    langs_lower = [lang.lower() for lang in langs]

	    # Work on a copy so the caller's frame keeps its original column labels.
	    df = df.copy()
	    df.columns = ["_".join(filter(None, col)) for col in df.columns]
	    available = set(df.columns)

	    for task in tasks:
	        lang_cols = [f"{task}_{lang}" for lang in langs_lower]
	        if set(lang_cols).issubset(available):
	            # skipna=False: a model missing one language gets NaN for the task
	            df[task] = df[lang_cols].mean(axis=1, skipna=False)
	        else:
	            df[task] = np.nan

	    df["Average"] = df[tasks].mean(axis=1)
	    return df[["Type", "Model_Name", "Average"] + tasks]


	def select_shots(df: pd.DataFrame, fewshot: bool = False):
	    """Keep the columns whose second label equals ``fewshot`` and drop that label level.

	    The ``("Model_Name", "", "")`` and ``("Type", "", "")`` columns are
	    appended after the score columns.
	    """
	    selected = [label for label in df.columns if label[1] == fewshot]
	    # Model name and type icon go last
	    selected.extend([("Model_Name", "", ""), ("Type", "", "")])
	    subset = df[selected]
	    return subset.droplevel(level=1, axis="columns")


	def update_df(
	    tasks: list[str],
	    model_query: str,
	    langs: list[str],
	    model_types: list[str],
	    fewshot: bool = False,
	    format: bool = True,
	) -> "pd.DataFrame | pd.io.formats.style.Styler":
	    """Build the leaderboard table for the current selection.

	    Args:
	        tasks: task groups to show, one column each.
	        model_query: search string passed to ``search_model``.
	        langs: languages to average over for each task.
	        model_types: values of the ``Type`` column to keep.
	        fewshot: use the few-shot results when true, otherwise zero-shot.
	        format: when true, return a pandas ``Styler`` that displays numbers
	            with two decimal places; when false, return the plain frame.

	    Returns:
	        The filtered, column-sorted table, as a ``Styler`` or a ``DataFrame``
	        depending on ``format``.
	    """
	    # keep only selected shots
	    df = select_shots(hidden_df, fewshot)

	    # aggregate results over languages per task
	    df = aggregate_langs(df, tasks, langs)

	    # filter models by search bar and model type
	    df = search_model(df, model_query)
	    df = filter_type(df, model_types)

	    df = sort_cols(df, fewshot)
	    if format:
	        return df.style.format(precision=2, decimal=".")
	    return df


	def make_plot(df: pd.DataFrame):
	    """Plot one series per model over the rows of a transposed leaderboard table.

	    ``df`` must have a row labelled ``Model_Name``; its values become the
	    column labels (this assignment is made on ``df`` itself). The remaining
	    row labels are plotted on a categorical x axis named ``task``.
	    NOTE(review): only the ``Model_Name`` row is dropped here, so any other
	    non-score row in ``df`` is plotted as well -- confirm this is intended.

	    Returns:
	        A line chart with markers when there are more than two columns after
	        the index is reset, otherwise a bar chart of the last column.
	    """
	    df.columns = df.loc["Model_Name"]
	    plot_df = df.drop("Model_Name").reset_index(names="task")

	    has_several_series = len(plot_df.columns) > 2
	    if has_several_series:
	        fig = px.line(data_frame=plot_df, x="task", y=plot_df.columns, markers=True, width=1200)
	    else:
	        fig = px.bar(data_frame=plot_df, x="task", y=plot_df.columns[-1], width=1200)

	    fig.update_xaxes(type="category")
	    return fig


	def update_plot(
	    tasks: list[str],
	    model_query: str,
	    langs: list[str],
	    model_types: list[str],
	    fewshot: bool = False,
	):
	    """Build the figure for the current selection.

	    Fetches the unformatted table from ``update_df``, transposes it and
	    passes it to ``make_plot``.
	    """
	    table = update_df(tasks, model_query, langs, model_types, fewshot, False)
	    return make_plot(table.transpose())


	def fix_zeroshot(tasks: list[str | int | float], fewshot: bool = False):
	    """Rebuild the task checkbox group after the evaluation type changed.

	    The choices are the task groups of the currently selected tab
	    (``TAB_STATE``). In zero-shot mode the GSM8K group is removed from them;
	    in few-shot mode the BELEBELE group is removed. The selection keeps the
	    entries of ``tasks`` that are still valid choices. On tab 0 the group
	    that is not hidden in the chosen mode (BELEBELE for zero-shot, GSM8K for
	    few-shot) is additionally selected, provided it is one of the choices.

	    Args:
	        tasks: currently selected task groups.
	        fewshot: true for few-shot, false for zero-shot.

	    Returns:
	        A ``gr.CheckboxGroup`` with the updated choices and selection.
	    """
	    choices = task_groups_with_task_type(get_selected_task_type(TAB_STATE))

	    if fewshot:
	        hidden_task, forced_task = BELEBELE_TASK_GROUP_NAME, GSM8K_TASK_GROUP_NAME
	    else:
	        hidden_task, forced_task = GSM8K_TASK_GROUP_NAME, BELEBELE_TASK_GROUP_NAME

	    if hidden_task in choices:
	        choices.remove(hidden_task)

	    value = [task for task in tasks if task in choices]
	    # Only force-select a task that is actually offered, so the selection
	    # never contains an entry missing from the choices.
	    if TAB_STATE == 0 and forced_task in choices and forced_task not in value:
	        value.append(forced_task)

	    return gr.CheckboxGroup(
	        choices=choices,
	        value=value,
	        label="Select tasks to show",
	        elem_id="column-select",
	        interactive=True,
	        scale=50,
	    )


	def update_tab_tasks(id: int, fewshot: bool = False):
	    """Switch to tab ``id`` and rebuild the task and evaluation-type widgets.

	    Stores ``id`` in ``TAB_STATE``. Tab 0 resets the evaluation-type radio to
	    few-shot and leaves it interactive; tab 1 pins it to 0-shot and disables
	    it. The task choices are filtered to match the value the radio is reset
	    to (BELEBELE removed for few-shot, GSM8K removed for 0-shot), so the two
	    returned components always agree. All remaining choices are selected.

	    Args:
	        id: index of the tab being selected (0 or 1).
	        fewshot: previous evaluation type. Kept for interface compatibility;
	            the radio is reset on every tab switch, so this value does not
	            affect the result.

	    Returns:
	        ``[shown_tasks, evaluation_type]``: a ``gr.CheckboxGroup`` and a
	        ``gr.Radio``.

	    Raises:
	        KeyError: if ``id`` is not a known tab index.
	    """
	    # when the tab is changed, update the TAB_STATE accordingly
	    global TAB_STATE
	    TAB_STATE = id
	    choices = task_groups_with_task_type(get_selected_task_type(id))

	    # (radio value, radio interactive) for each tab
	    radio_defaults = {0: (True, True), 1: (False, False)}
	    radio_value, radio_interactive = radio_defaults[id]

	    hidden_task = BELEBELE_TASK_GROUP_NAME if radio_value else GSM8K_TASK_GROUP_NAME
	    if hidden_task in choices:
	        choices.remove(hidden_task)

	    shown_tasks = gr.CheckboxGroup(
	        choices=choices,
	        value=choices.copy(),
	        label="Select tasks to show",
	        elem_id="column-select",
	        interactive=True,
	        scale=50,
	    )
	    evaluation_type = gr.Radio(
	        choices=[("0-Shot", False), ("Few-shot", True)],
	        value=radio_value,
	        label="Select evaluation type",
	        interactive=radio_interactive,
	        scale=29,
	    )
	    return [shown_tasks, evaluation_type]


	def get_selected_task_type(task_type_id):
	    """Map a tab index to its task type: 0 -> "accuracy", 1 -> "misc".

	    Raises:
	        KeyError: for any other index.
	    """
	    return {0: "accuracy", 1: "misc"}[task_type_id]


	def task_groups_with_task_type(selected_task_type):
	    """Return a new list of the task groups in ``task_group_type_dict`` whose type equals ``selected_task_type``."""
	    matching_groups = []
	    for group_name, group_type in task_group_type_dict.items():
	        if group_type == selected_task_type:
	            matching_groups.append(group_name)
	    return matching_groups


	# Runs at import time: loads the dataset and sets the module-level globals
	# (hidden_df, task_group_type_dict, task_groups_shots_dict, ...) that the
	# functions in this module read.
	init()