|
import itertools |
|
import os |
|
|
|
import gradio as gr |
|
import numpy as np |
|
import pandas as pd |
|
import plotly.express as px |
|
from datasets import load_dataset |
|
|
|
import style |
|
|
|
# Index of the tab currently selected in the UI. update_tab_tasks() overwrites
# it, and get_selected_task_type() maps it to a task type
# (0 -> "accuracy", 1 -> "misc").
TAB_STATE: int = 0

# Task groups that fix_zeroshot() and update_tab_tasks() handle specially:
# GSM8K is removed from the choices in zero-shot mode, BELEBELE in few-shot mode.
GSM8K_TASK_GROUP_NAME: str = "GSM8K"
BELEBELE_TASK_GROUP_NAME: str = "BELEBELE"
|
|
|
|
|
def init():
    """Load the leaderboard dataset and fill the module-level lookup tables.

    The dataset location is read from the environment variables
    ``OGX_LEADERBOARD_DATASET_NAME``, ``OGX_LEADERBOARD_DATASET_CONFIG`` and
    ``OGX_LEADERBOARD_DATASET_SPLIT`` and fetched with ``load_dataset``.

    Globals written:
        repo_id, config_name, split_name: the raw environment values.
        task_group_names_list: unique ``Task_Group`` values.
        task_group_type_dict: ``Task_Group`` -> ``Task_Type``.
        task_groups_shots_dict: ``Task_Group`` -> ``Number_Shots``, built only
            from rows whose ``Few_Shot`` equals ``True``.
        languages_list: unique ``Language`` values, upper-cased.
        model_type_dict: ``Model_Name`` -> ``Model_Type``.
        hidden_df: ``Value`` pivoted to one row per ``Model_Name`` with
            ``(Task_Group, Few_Shot, Language)`` columns, plus a ``Type``
            column looked up in ``style.T_SYMBOLS``.
    """
    global repo_id, config_name, split_name, hidden_df, task_group_names_list, task_group_type_dict, task_groups_shots_dict, languages_list, model_type_dict

    def _pairs_to_dict(frame: pd.DataFrame, key_col: str, value_col: str) -> dict:
        # Map each distinct key to its value (a later duplicate key wins).
        unique_pairs = frame[[key_col, value_col]].drop_duplicates()
        return unique_pairs.set_index(key_col)[value_col].to_dict()

    repo_id = os.getenv("OGX_LEADERBOARD_DATASET_NAME")
    config_name = os.getenv("OGX_LEADERBOARD_DATASET_CONFIG")
    split_name = os.getenv("OGX_LEADERBOARD_DATASET_SPLIT")

    hidden_df = load_dataset(repo_id, config_name, split=split_name).to_pandas()

    # Lookup tables are derived from the long-format frame before pivoting.
    task_group_names_list = hidden_df["Task_Group"].unique().tolist()
    task_group_type_dict = _pairs_to_dict(hidden_df, "Task_Group", "Task_Type")
    few_shot_rows = hidden_df[hidden_df["Few_Shot"].eq(True)]
    task_groups_shots_dict = _pairs_to_dict(few_shot_rows, "Task_Group", "Number_Shots")
    languages_list = hidden_df["Language"].drop_duplicates().str.upper().tolist()
    model_type_dict = _pairs_to_dict(hidden_df, "Model_Name", "Model_Type")

    # Wide format: one row per model, one column per (task, few-shot, language).
    pivoted = hidden_df.pivot_table(
        columns=["Task_Group", "Few_Shot", "Language"],
        index=["Model_Name"],
        values="Value",
        dropna=False,
    )
    hidden_df = pivoted.reset_index()

    hidden_df["Type"] = hidden_df["Model_Name"].apply(
        lambda model_name: style.T_SYMBOLS[model_type_dict[model_name]]
    )
|
|
|
|
|
def sort_cols(df: pd.DataFrame, fewshot: bool = False) -> pd.DataFrame:
    """Return ``df`` with the fixed columns first and task columns sorted.

    The result always starts with ``Type``, ``Model_Name`` and ``Average``,
    followed by the task columns in sorted order.

    Args:
        df: Frame containing ``Type``, ``Model_Name``, ``Average`` and any
            number of task columns.
        fewshot: When true, each task column that has an entry in
            ``task_groups_shots_dict`` is renamed to ``"<task> (<n>-shot)"``.
            Task columns without such an entry are left out of the result.

    Returns:
        A new, reordered frame. The frame passed in is not modified.
    """
    task_cols = get_task_columns(df)
    if fewshot:
        renamer = {
            col: f"{col} ({task_groups_shots_dict[col]}-shot)"
            for col in task_cols
            if col in task_groups_shots_dict
        }
        # Non-mutating rename: the caller's frame may be a filtered slice of
        # another frame, so it must not be altered in place.
        df = df.rename(columns=renamer)
        task_cols = list(renamer.values())
    return df.reindex(["Type", "Model_Name", "Average"] + sorted(task_cols), axis=1)
|
|
|
|
|
def get_task_columns(df: pd.DataFrame) -> list:
    """Return the column labels of ``df`` that are task columns.

    Task columns are all columns except the bookkeeping columns
    ``Model_Name``, ``Average`` and ``Type``. The original column order is
    kept and ``df`` itself is not modified.

    Raises:
        ValueError: If one of the three bookkeeping columns is missing.
    """
    columns = list(df.columns)
    for meta_column in ("Model_Name", "Average", "Type"):
        columns.remove(meta_column)
    return columns
|
|
|
|
|
def get_models(df: pd.DataFrame) -> np.ndarray:
    """Return the distinct values of the ``Model_Name`` column.

    Values appear in order of first occurrence. The result is whatever
    ``Series.unique()`` yields for the column's dtype (a NumPy array for
    ordinary object columns), not a DataFrame.
    """
    return df["Model_Name"].unique()
|
|
|
|
|
def filter_type(df: pd.DataFrame, model_types: list[str]) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``Type`` value is one of ``model_types``."""
    type_matches = df["Type"].isin(model_types)
    return df.loc[type_matches]
|
|
|
|
|
def search_model(df: pd.DataFrame, query: str) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``Model_Name`` matches the search query.

    The query is a ``;``-separated list of search terms. A row is kept when
    its model name contains at least one term, ignoring case. Terms are
    matched literally, so characters such as ``(``, ``+`` or ``.`` in a model
    name can be searched for without raising a regex error. Whitespace around
    a term and empty terms (for example from a trailing ``;``) are ignored.
    A query without any term keeps every row that has a model name.

    Args:
        df: Frame with a ``Model_Name`` column.
        query: Raw text typed by the user.

    Returns:
        The matching rows. Rows with a missing model name never match.
    """
    import re  # local import: this module's header does not import ``re``

    terms = [term.strip() for term in query.split(";")]
    # Escape each term so user input cannot inject regex syntax; the
    # alternation is the only regex feature that is used.
    pattern = "|".join(re.escape(term) for term in terms if term)
    matches = df["Model_Name"].str.contains(pattern, case=False, na=False, regex=True)
    return df[matches]
|
|
|
|
|
def aggregate_langs(df: pd.DataFrame, tasks: list, langs: list):
    """Aggregate results over ``langs`` for each task in ``tasks``.

    The columns of ``df`` are expected to be tuples such as
    ``(task, language)``. They are flattened to ``"<task>_<language>"``, with
    empty tuple parts dropped. For every task the mean over the selected
    languages is computed. If any selected language is missing for a task,
    or any of its values is NaN for a model, that aggregate is NaN.

    Args:
        df: Frame with tuple column labels, including ``Type`` and
            ``Model_Name``. It is left unmodified.
        tasks: Task names to aggregate.
        langs: Language codes in any case; they are lower-cased before the
            column lookup.

    Returns:
        A new frame with the columns ``Type``, ``Model_Name``, ``Average``
        (mean over the task aggregates, skipping NaN) and one column per task.
    """
    langs_lower = [lang.lower() for lang in langs]

    # Work on a copy: relabelling the caller's frame would corrupt it for any
    # later call, which would join the characters of the flattened names.
    flat = df.copy()
    flat.columns = ["_".join(filter(None, col)) for col in flat.columns]
    available = set(flat.columns)

    for task in tasks:
        lang_cols = [f"{task}_{lang}" for lang in langs_lower]
        if available.issuperset(lang_cols):
            flat[task] = flat[lang_cols].mean(axis=1, skipna=False)
        else:
            flat[task] = np.nan

    flat["Average"] = flat[tasks].mean(axis=1)
    return flat[["Type", "Model_Name", "Average"] + tasks]
|
|
|
|
|
def select_shots(df: pd.DataFrame, fewshot: bool = False):
    """Keep the columns for one evaluation mode and drop the few-shot level.

    Columns of ``df`` are three-part tuples whose second element is the
    few-shot flag. Columns whose flag equals ``fewshot`` are kept, followed by
    the ``Model_Name`` and ``Type`` columns. The second column level is then
    removed from the result.
    """
    matching = [label for label in df.columns if label[1] == fewshot]
    wanted = [*matching, ("Model_Name", "", ""), ("Type", "", "")]
    selected = df[wanted]
    return selected.droplevel(level=1, axis="columns")
|
|
|
|
|
def update_df(
    tasks: list[str],
    model_query: str,
    langs: list[str],
    model_types: list[str],
    fewshot: bool = False,
    format: bool = True,
) -> pd.DataFrame:
    """Build the leaderboard table for the current selection.

    Starting from ``hidden_df``, the columns for the requested evaluation
    mode are selected, results are aggregated over ``langs`` for each entry
    in ``tasks``, and rows are filtered by ``model_query`` and
    ``model_types``. The columns are finally ordered by ``sort_cols``.

    When ``format`` is true the result is wrapped via
    ``.style.format(precision=2, decimal=".")``, so a Styler is returned
    rather than a plain DataFrame. Otherwise the DataFrame itself is returned.
    """
    shots_df = select_shots(hidden_df, fewshot)
    aggregated = aggregate_langs(shots_df, tasks, langs)
    searched = search_model(aggregated, model_query)
    filtered = filter_type(searched, model_types)

    table = sort_cols(filtered, fewshot)
    if not format:
        return table
    return table.style.format(precision=2, decimal=".")
|
|
|
|
|
def make_plot(df: pd.DataFrame):
    """Create a plotly figure from a transposed leaderboard table.

    ``df`` must have a row labelled ``Model_Name``. Its values become the
    column labels (assigned on ``df`` in place), and the row itself is dropped.
    The remaining index is turned into a ``task`` column that serves as the
    x axis.

    A line chart with markers is drawn when the frame has more than two
    columns. Otherwise a bar chart of the last column is drawn. The x axis is
    always treated as categorical.
    """
    df.columns = df.loc["Model_Name"]
    plot_df = df.drop("Model_Name").reset_index(names="task")

    has_multiple_series = len(plot_df.columns) > 2
    if has_multiple_series:
        fig = px.line(data_frame=plot_df, x="task", y=plot_df.columns, markers=True, width=1200)
    else:
        fig = px.bar(data_frame=plot_df, x="task", y=plot_df.columns[-1], width=1200)

    fig.update_xaxes(type="category")
    return fig
|
|
|
|
|
def update_plot(
    tasks: list[str],
    model_query: str,
    langs: list[str],
    model_types: list[str],
    fewshot: bool = False,
):
    """Return the figure for the current selection.

    The unformatted table from ``update_df`` is transposed and passed to
    ``make_plot``.
    """
    table = update_df(tasks, model_query, langs, model_types, fewshot, False)
    return make_plot(table.transpose())
|
|
|
|
|
def fix_zeroshot(tasks: list[str | int | float], fewshot: bool = False):
    """Rebuild the task checkbox group after the evaluation mode changed.

    The available choices are the task groups of the active tab
    (``TAB_STATE``) minus the one task group hidden in the selected mode:
    GSM8K in zero-shot mode, BELEBELE in few-shot mode. The previous
    selection ``tasks`` is kept as far as it is still available.

    On tab 0 the counterpart task group (BELEBELE in zero-shot mode, GSM8K
    in few-shot mode) is additionally selected, provided it is one of the
    available choices.

    Args:
        tasks: Currently selected task groups.
        fewshot: True for few-shot mode, False for zero-shot mode.

    Returns:
        A ``gr.CheckboxGroup`` with the updated choices and selection.
    """
    if fewshot:
        hidden_task, forced_task = BELEBELE_TASK_GROUP_NAME, GSM8K_TASK_GROUP_NAME
    else:
        hidden_task, forced_task = GSM8K_TASK_GROUP_NAME, BELEBELE_TASK_GROUP_NAME

    tab_task_groups = task_groups_with_task_type(get_selected_task_type(TAB_STATE))
    choices = [name for name in tab_task_groups if name != hidden_task]

    value = [task for task in tasks if task in choices]
    # Never select a task group that is not offered as a choice.
    if TAB_STATE == 0 and forced_task in choices and forced_task not in value:
        value.append(forced_task)

    return gr.CheckboxGroup(
        choices=choices,
        value=value,
        label="Select tasks to show",
        elem_id="column-select",
        interactive=True,
        scale=50,
    )
|
|
|
|
|
def update_tab_tasks(id: int, fewshot: bool = False):
    """Switch the active tab and rebuild the task and evaluation-type widgets.

    Tab 0 resets the evaluation-type radio to few-shot and leaves it
    interactive. Tab 1 pins it to zero-shot and disables it. The task choices
    are derived from that resulting mode, so the returned checkbox group and
    radio always agree: BELEBELE is hidden in few-shot mode and GSM8K in
    zero-shot mode. All remaining choices are pre-selected.

    Args:
        id: Index of the selected tab (0 or 1). It is stored in ``TAB_STATE``.
        fewshot: Previous value of the evaluation-type radio. It is accepted
            for compatibility with existing callers. The mode is determined
            by the tab alone because the radio is reset on every tab change.

    Returns:
        ``[gr.CheckboxGroup, gr.Radio]`` with the updated widgets.

    Raises:
        KeyError: If ``id`` is not a known tab index. ``TAB_STATE`` is left
            untouched in that case.
    """
    global TAB_STATE

    # Validate the tab index before committing it to the shared state.
    selected_task_type = get_selected_task_type(id)
    TAB_STATE = id

    tab_fewshot = id == 0
    hidden_task = BELEBELE_TASK_GROUP_NAME if tab_fewshot else GSM8K_TASK_GROUP_NAME
    choices = [
        name
        for name in task_groups_with_task_type(selected_task_type)
        if name != hidden_task
    ]

    shown_tasks = gr.CheckboxGroup(
        choices=choices,
        value=list(choices),
        label="Select tasks to show",
        elem_id="column-select",
        interactive=True,
        scale=50,
    )
    shot_selector = gr.Radio(
        choices=[("0-Shot", False), ("Few-shot", True)],
        value=tab_fewshot,
        label="Select evaluation type",
        interactive=tab_fewshot,
        scale=29,
    )
    return [shown_tasks, shot_selector]
|
|
|
|
|
def get_selected_task_type(task_type_id):
    """Map a tab index to its task type: 0 -> "accuracy", 1 -> "misc".

    Any other index raises ``KeyError``.
    """
    return {0: "accuracy", 1: "misc"}[task_type_id]
|
|
|
|
|
def task_groups_with_task_type(selected_task_type):
    """Return a new list of the task groups whose type is ``selected_task_type``.

    The lookup uses ``task_group_type_dict`` and keeps its iteration order.
    """
    matching_groups = []
    for group_name, group_type in task_group_type_dict.items():
        if group_type == selected_task_type:
            matching_groups.append(group_name)
    return matching_groups
|
|
|
|
|
# Runs at import time: init() loads the dataset and fills the module-level
# globals (hidden_df and the lookup dicts) that the functions above read.
init()
|
|