Spaces:

openGPT-X
/

european-llm-leaderboard

Running

App Files Files Community

european-llm-leaderboard / core.py

jjbuschhoff

Temporary mechanism for filtering tasks from zero- or few-shot view

65504f2 3 months ago

raw

history blame

7.7 kB

	import itertools
	import os

	import gradio as gr
	import numpy as np
	import pandas as pd
	import plotly.express as px
	from datasets import load_dataset

	import style

	# Index of the currently selected tab. It is written by update_tab_tasks()
	# and read by fix_zeroshot(); get_selected_task_type() maps it to a task
	# type (0 -> "accuracy", 1 -> "misc").
	TAB_STATE = 0 # FIXME
	# Task groups removed from the selectable choices when the few-shot view is
	# active on tab 0 (see fix_zeroshot()).
	NO_FEWSHOT = ["BELEBELE"] # FIXME
	# Task groups removed from the selectable choices when the zero-shot view is
	# active (see fix_zeroshot() and update_tab_tasks()).
	NO_ZEROSHOT = ["GSM8K"] # FIXME


	def init():
	    """Load the leaderboard dataset and fill the module-level lookup state.

	    The dataset location is read from the environment variables
	    ``OGX_LEADERBOARD_DATASET_NAME``, ``OGX_LEADERBOARD_DATASET_CONFIG`` and
	    ``OGX_LEADERBOARD_DATASET_SPLIT``. After loading, the function derives
	    the task-group, language and model-type lookups from the long-format
	    table and finally replaces ``hidden_df`` with a pivoted table that has
	    one row per model and ``(Task_Group, Few_Shot, Language)`` columns, plus
	    a ``Type`` column holding the symbol for the model's type.
	    """
	    global repo_id, config_name, split_name
	    global hidden_df, languages_list, model_type_dict
	    global task_group_names_list, task_group_type_dict, task_groups_shots_dict

	    def _column_lookup(frame, key_col, value_col):
	        """Return a dict mapping each ``key_col`` entry to its ``value_col`` entry."""
	        pairs = frame[[key_col, value_col]].drop_duplicates()
	        return pairs.set_index(key_col)[value_col].to_dict()

	    repo_id = os.getenv("OGX_LEADERBOARD_DATASET_NAME")
	    config_name = os.getenv("OGX_LEADERBOARD_DATASET_CONFIG")
	    split_name = os.getenv("OGX_LEADERBOARD_DATASET_SPLIT")

	    hidden_df = load_dataset(repo_id, config_name, split=split_name).to_pandas()

	    task_group_names_list = hidden_df["Task_Group"].unique().tolist()
	    task_group_type_dict = _column_lookup(hidden_df, "Task_Group", "Task_Type")
	    # Element-wise comparison on a pandas column, hence ``== True``.
	    few_shot_rows = hidden_df[hidden_df["Few_Shot"] == True]
	    task_groups_shots_dict = _column_lookup(few_shot_rows, "Task_Group", "Number_Shots")
	    languages_list = hidden_df["Language"].drop_duplicates().str.upper().tolist()
	    model_type_dict = _column_lookup(hidden_df, "Model_Name", "Model_Type")

	    # Wide format: one row per model, one column per (task, shots, language).
	    pivoted = hidden_df.pivot_table(
	        columns=["Task_Group", "Few_Shot", "Language"],
	        index=["Model_Name"],
	        values="Value",
	        dropna=False,
	    )
	    hidden_df = pivoted.reset_index()

	    hidden_df["Type"] = hidden_df["Model_Name"].apply(
	        lambda name: style.T_SYMBOLS[model_type_dict[name]]
	    )


	def sort_cols(df: pd.DataFrame, fewshot: bool = False) -> pd.DataFrame:
	    """Return ``df`` with the fixed columns first and task columns sorted.

	    The resulting column order is ``Type``, ``Model_Name``, ``Average``
	    followed by the remaining (task) columns in sorted order. ``fewshot`` is
	    accepted but not used by this function.
	    """
	    leading = ["Type", "Model_Name", "Average"]
	    ordered_tasks = sorted(get_task_columns(df))
	    return df.reindex(columns=leading + ordered_tasks)


	def get_task_columns(df: pd.DataFrame) -> list:
	    """Return the column labels of ``df`` that are not fixed metadata columns.

	    The labels ``Model_Name``, ``Average`` and ``Type`` are removed from a
	    copy of the column list; ``df`` itself is left untouched.

	    Raises:
	        ValueError: if any of the three metadata columns is missing.
	    """
	    task_columns = list(df.columns)
	    for meta_column in ("Model_Name", "Average", "Type"):
	        task_columns.remove(meta_column)
	    return task_columns


	def get_models(df: pd.DataFrame) -> pd.DataFrame:
	return df["Model_Name"].unique()


	def filter_type(df: pd.DataFrame, model_types: list[str]) -> pd.DataFrame:
	    """Return the rows of ``df`` whose ``Type`` value appears in ``model_types``."""
	    type_is_selected = df["Type"].isin(model_types)
	    return df[type_is_selected]


	def search_model(df: pd.DataFrame, query: str) -> pd.DataFrame:
	    """Keep only rows whose model name matches the search query.

	    ``query`` may hold several search terms separated by ``;``; a row is
	    kept when its ``Model_Name`` matches any of them. Terms are matched
	    case-insensitively as regular expressions. Surrounding whitespace is
	    stripped and empty terms are ignored, so a trailing ``;`` does not
	    turn the query into one that matches every row. An empty query keeps
	    all rows.
	    """
	    terms = [term.strip() for term in query.split(";")]
	    # Join with a bare "|" so the terms form a regex alternation; an escaped
	    # pipe would only match a literal "|" character in the model name.
	    pattern = "|".join(term for term in terms if term)
	    return df[df["Model_Name"].str.contains(pattern, case=False)]


	def aggregate_langs(df: pd.DataFrame, tasks: list, langs: list):
	    """Aggregate results over ``langs`` for each task in ``tasks``.

	    The (task, language) column labels of ``df`` are flattened to
	    ``"<task>_<language>"`` strings (empty label parts are dropped). For
	    each task, the per-language columns are averaged row-wise; if any of
	    the requested languages has no column for a task, or a row holds a
	    missing value, the aggregate is NaN. ``Average`` is the row-wise mean
	    over the task aggregates, ignoring NaN.

	    The input frame is not modified; all work happens on a copy.

	    Returns:
	        A DataFrame with the columns ``Type``, ``Model_Name``, ``Average``
	        followed by one column per entry of ``tasks``.
	    """
	    task_names = list(tasks)
	    langs_lower = [lang.lower() for lang in langs]

	    # Copy first so neither the relabelling nor the new columns leak into
	    # the caller's frame.
	    result = df.copy()
	    result.columns = ["_".join(filter(None, col)) for col in result.columns]
	    available = set(result.columns)

	    for task in task_names:
	        lang_cols = [f"{task}_{lang}" for lang in langs_lower]
	        if set(lang_cols).issubset(available):
	            # skipna=False: one missing language score makes the aggregate NaN.
	            result[task] = result[lang_cols].mean(axis=1, skipna=False)
	        else:
	            result[task] = np.nan

	    result["Average"] = result[task_names].mean(axis=1)
	    return result[["Type", "Model_Name", "Average"] + task_names]


	def select_shots(df: pd.DataFrame, fewshot: bool = False):
	    """Return the columns of ``df`` whose second label level equals ``fewshot``.

	    The ``Model_Name`` and ``Type`` columns are appended after the selected
	    score columns, and the second column level is dropped from the result.
	    """
	    score_columns = [label for label in df.columns if label[1] == fewshot]
	    # Model name and type icon go last.
	    wanted = [*score_columns, ("Model_Name", "", ""), ("Type", "", "")]
	    selection = df[wanted]
	    return selection.droplevel(level=1, axis="columns")


	def update_df(
	    tasks: list[str],
	    model_query: str,
	    langs: list[str],
	    model_types: list[str],
	    fewshot: bool = False,
	    format: bool = True,
	) -> pd.DataFrame:
	    """Build the leaderboard table for the current selection.

	    Starting from the module-level ``hidden_df``, the function keeps the
	    columns for the requested shot setting, aggregates the scores over
	    ``langs`` for each entry of ``tasks``, filters the rows by
	    ``model_query`` and ``model_types`` and orders the columns.

	    When ``format`` is true, the result is returned as a styled table
	    (``DataFrame.style``) rendered with a precision of 2 and ``"N/A"`` for
	    missing values; otherwise the plain DataFrame is returned.
	    """
	    # Shot selection, then per-task aggregation over the chosen languages.
	    shot_view = select_shots(hidden_df, fewshot)
	    aggregated = aggregate_langs(shot_view, tasks, langs)

	    # Row filters: search bar first, then model type.
	    matching = search_model(aggregated, model_query)
	    visible = filter_type(matching, model_types)

	    ordered = sort_cols(visible, fewshot)
	    if not format:
	        return ordered
	    return ordered.style.format(precision=2, decimal=".", na_rep="N/A")


	def make_plot(df: pd.DataFrame):
	    """Create a plotly figure from a transposed leaderboard table.

	    ``df`` must contain a row labelled ``Model_Name``; its values become the
	    column labels (this assignment happens on the passed frame itself). The
	    remaining index is turned into a ``task`` column used as the x axis.
	    If the table then has more than two columns, a line chart over all
	    columns is drawn; otherwise a bar chart of the last column.
	    """
	    df.columns = df.loc["Model_Name"]
	    plot_df = df.drop("Model_Name").reset_index(names="task")

	    more_than_two_columns = len(plot_df.columns) > 2
	    if more_than_two_columns:
	        fig = px.line(data_frame=plot_df, x="task", y=plot_df.columns, markers=True, width=1200)
	    else:
	        fig = px.bar(data_frame=plot_df, x="task", y=plot_df.columns[-1], width=1200)

	    # Treat the x values as discrete categories.
	    fig.update_xaxes(type="category")
	    return fig


	def update_plot(
	    tasks: list[str],
	    model_query: str,
	    langs: list[str],
	    model_types: list[str],
	    fewshot: bool = False,
	):
	    """Return the figure for the current selection.

	    The unformatted table from ``update_df`` is transposed and handed to
	    ``make_plot``.
	    """
	    table = update_df(tasks, model_query, langs, model_types, fewshot, False)
	    transposed = table.transpose()
	    return make_plot(transposed)


	def fix_zeroshot(tasks: list[str | int | float], fewshot: bool = False):
	    """Rebuild the task checkbox group after the shot setting changed.

	    The selectable choices are the task groups of the current tab
	    (``TAB_STATE``), minus ``NO_ZEROSHOT`` in the zero-shot view and minus
	    ``NO_FEWSHOT`` in the few-shot view of tab 0. The pre-selected value
	    keeps every entry of ``tasks`` that is still a choice. In addition, the
	    tasks that were excluded by the *other* view are re-selected
	    (``NO_FEWSHOT`` in the zero-shot view, ``NO_ZEROSHOT`` in the few-shot
	    view of tab 0), but only when they are valid choices for the current
	    tab, so the value never contains an entry that cannot be shown.

	    Returns:
	        A ``gr.CheckboxGroup`` with the updated choices and value.
	    """
	    selected_task_type = get_selected_task_type(TAB_STATE)
	    choices = task_groups_with_task_type(selected_task_type)

	    if not fewshot:
	        choices = [c for c in choices if c not in NO_ZEROSHOT]
	        restored = NO_FEWSHOT
	    elif TAB_STATE == 0:
	        choices = [c for c in choices if c not in NO_FEWSHOT]
	        restored = NO_ZEROSHOT
	    else:
	        # Few-shot view on the other tab: nothing is excluded or restored.
	        restored = []

	    value = [v for v in tasks if v in choices]
	    value += [t for t in restored if t in choices and t not in value]

	    return gr.CheckboxGroup(
	        choices=choices,
	        value=value,
	        label="Select tasks to show",
	        elem_id="column-select",
	        interactive=True,
	        scale=50,
	    )


	def update_tab_tasks(id: int, fewshot: bool = False):
	    """Handle a tab change and return the refreshed task and shot widgets.

	    ``TAB_STATE`` is set to ``id``. The task checkbox group lists the task
	    groups of the new tab (without ``NO_ZEROSHOT`` when ``fewshot`` is
	    false), all of them selected. For tab 0 the evaluation-type radio is
	    set to few-shot and stays interactive; for tab 1 it is set to 0-shot
	    and made non-interactive.

	    Returns:
	        ``[shown_tasks, fewshot]`` where ``shown_tasks`` is the
	        ``gr.CheckboxGroup`` and ``fewshot`` the ``gr.Radio``.
	    """
	    # Remember the active tab for later fix_zeroshot() calls.
	    global TAB_STATE
	    TAB_STATE = id

	    task_type = get_selected_task_type(TAB_STATE)
	    choices = task_groups_with_task_type(task_type)
	    if not fewshot:
	        choices = [name for name in choices if name not in NO_ZEROSHOT]

	    shown_tasks = gr.CheckboxGroup(
	        choices=choices,
	        value=choices.copy(),
	        label="Select tasks to show",
	        elem_id="column-select",
	        interactive=True,
	        scale=50,
	    )

	    # Arguments shared by both variants of the evaluation-type selector.
	    radio_common = {
	        "choices": [("0-Shot", False), ("Few-shot", True)],
	        "label": "Select evaluation type",
	        "scale": 29,
	    }
	    if id == 0:
	        # switching to accuracy tab, default to fewshot
	        fewshot = gr.Radio(value=True, interactive=True, **radio_common)
	    elif id == 1:
	        # switching to translation tab, default to 0-shot and disable selection
	        fewshot = gr.Radio(value=False, interactive=False, **radio_common)

	    return [shown_tasks, fewshot]


	def get_selected_task_type(task_type_id):
	    """Translate a tab index into its task type name.

	    ``0`` maps to ``"accuracy"`` and ``1`` to ``"misc"``; any other value
	    raises ``KeyError``.
	    """
	    return {0: "accuracy", 1: "misc"}[task_type_id]


	def task_groups_with_task_type(selected_task_type):
	    """List the task groups whose type equals ``selected_task_type``.

	    The lookup uses the module-level ``task_group_type_dict`` and keeps its
	    iteration order.
	    """
	    matching_groups = []
	    for group_name, group_type in task_group_type_dict.items():
	        if group_type == selected_task_type:
	            matching_groups.append(group_name)
	    return matching_groups


	# Runs at import time: loads the dataset and sets the module-level state
	# (hidden_df and the lookup lists/dicts) that the functions above read.
	init()