Merge pull request #11 from OpenGPTX/fix/belebele_fewshot
Browse files
Don't show belebele in fewshot mode; Don't show truthfulqa in zero shot setting; Remove Plots
app.py
CHANGED
@@ -104,8 +104,6 @@ with demo:
|
|
104 |
id=1,
|
105 |
) as misc:
|
106 |
leaderboard_table_misc = gr.Dataframe()
|
107 |
-
with gr.TabItem("Plots", elem_id="llm-plot-tab", id=2) as plot:
|
108 |
-
leaderboard_plot = gr.Plot(elem_id="plot")
|
109 |
acc.select(
|
110 |
lambda x: core.update_tab_tasks(0, x),
|
111 |
inputs=fewshot,
|
@@ -133,11 +131,7 @@ with demo:
|
|
133 |
[shown_tasks, search_bar, langs_bar, model_types, fewshot],
|
134 |
leaderboard_table_misc,
|
135 |
)
|
136 |
-
|
137 |
-
core.update_plot,
|
138 |
-
[shown_tasks, search_bar, langs_bar, model_types, fewshot],
|
139 |
-
leaderboard_plot,
|
140 |
-
)
|
141 |
|
142 |
gr.Blocks.load(
|
143 |
block=demo,
|
@@ -153,11 +147,4 @@ with demo:
|
|
153 |
outputs=leaderboard_table_misc,
|
154 |
)
|
155 |
|
156 |
-
gr.Blocks.load(
|
157 |
-
block=demo,
|
158 |
-
fn=core.update_plot,
|
159 |
-
inputs=[shown_tasks, search_bar, langs_bar, model_types, fewshot],
|
160 |
-
outputs=leaderboard_plot,
|
161 |
-
)
|
162 |
-
|
163 |
demo.launch()
|
|
|
104 |
id=1,
|
105 |
) as misc:
|
106 |
leaderboard_table_misc = gr.Dataframe()
|
|
|
|
|
107 |
acc.select(
|
108 |
lambda x: core.update_tab_tasks(0, x),
|
109 |
inputs=fewshot,
|
|
|
131 |
[shown_tasks, search_bar, langs_bar, model_types, fewshot],
|
132 |
leaderboard_table_misc,
|
133 |
)
|
134 |
+
|
|
|
|
|
|
|
|
|
135 |
|
136 |
gr.Blocks.load(
|
137 |
block=demo,
|
|
|
147 |
outputs=leaderboard_table_misc,
|
148 |
)
|
149 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
150 |
demo.launch()
|
core.py
CHANGED
@@ -10,8 +10,8 @@ from datasets import load_dataset
|
|
10 |
import style
|
11 |
|
12 |
TAB_STATE = 0 # FIXME
|
13 |
-
|
14 |
-
|
15 |
|
16 |
|
17 |
def init():
|
@@ -45,10 +45,6 @@ def init():
|
|
45 |
|
46 |
def sort_cols(df: pd.DataFrame, fewshot: bool = False) -> pd.DataFrame:
|
47 |
task_cols = get_task_columns(df)
|
48 |
-
if fewshot:
|
49 |
-
renamer = {col: f"{col} ({task_groups_shots_dict[col]}-shot)" for col in task_cols if col in task_groups_shots_dict}
|
50 |
-
df.rename(columns=renamer, inplace=True)
|
51 |
-
task_cols = renamer.values()
|
52 |
return df.reindex(["Type", "Model_Name", "Average"] + sorted(task_cols), axis=1)
|
53 |
|
54 |
|
@@ -125,59 +121,24 @@ def update_df(
|
|
125 |
df = filter_type(df, model_types)
|
126 |
|
127 |
if format:
|
128 |
-
return sort_cols(df, fewshot).style.format(precision=2, decimal=".")
|
129 |
else:
|
130 |
return sort_cols(df, fewshot)
|
131 |
|
132 |
|
133 |
-
def make_plot(df: pd.DataFrame):
|
134 |
-
df.columns = df.loc["Model_Name"]
|
135 |
-
df = df.drop("Model_Name")
|
136 |
-
df = df.reset_index(names="task")
|
137 |
-
if len(df.columns) > 2:
|
138 |
-
fig = px.line(data_frame=df, x="task", y=df.columns, markers=True, width=1200)
|
139 |
-
else:
|
140 |
-
fig = px.bar(data_frame=df, x="task", y=df.columns[-1], width=1200)
|
141 |
-
fig.update_xaxes(type="category")
|
142 |
-
return fig
|
143 |
-
|
144 |
-
|
145 |
-
def update_plot(
|
146 |
-
tasks: list[str],
|
147 |
-
model_query: str,
|
148 |
-
langs: list[str],
|
149 |
-
model_types: list[str],
|
150 |
-
fewshot: bool = False,
|
151 |
-
):
|
152 |
-
df = update_df(tasks, model_query, langs, model_types, fewshot, False).transpose()
|
153 |
-
plot = make_plot(df)
|
154 |
-
return plot
|
155 |
-
|
156 |
-
|
157 |
def fix_zeroshot(tasks: list[str | int | float], fewshot: bool = False):
|
158 |
global TAB_STATE
|
159 |
selected_task_type = get_selected_task_type(TAB_STATE)
|
160 |
choices = task_groups_with_task_type(selected_task_type)
|
161 |
if not fewshot:
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
pass
|
166 |
-
if TAB_STATE == 0:
|
167 |
-
value = [v for v in tasks if v in choices]
|
168 |
-
if BELEBELE_TASK_GROUP_NAME not in value:
|
169 |
-
value += [BELEBELE_TASK_GROUP_NAME]
|
170 |
-
elif TAB_STATE == 1:
|
171 |
-
value = [v for v in tasks if v in choices]
|
172 |
else:
|
173 |
-
try:
|
174 |
-
choices.remove(BELEBELE_TASK_GROUP_NAME)
|
175 |
-
except ValueError:
|
176 |
-
pass
|
177 |
if TAB_STATE == 0:
|
|
|
178 |
value = [v for v in tasks if v in choices]
|
179 |
-
if
|
180 |
-
value += [GSM8K_TASK_GROUP_NAME]
|
181 |
elif TAB_STATE == 1:
|
182 |
value = [v for v in tasks if v in choices]
|
183 |
shown_tasks = gr.CheckboxGroup(
|
@@ -198,16 +159,7 @@ def update_tab_tasks(id: int, fewshot: bool = False):
|
|
198 |
selected_task_type = get_selected_task_type(TAB_STATE)
|
199 |
choices = task_groups_with_task_type(selected_task_type)
|
200 |
if not fewshot:
|
201 |
-
|
202 |
-
choices.remove(GSM8K_TASK_GROUP_NAME)
|
203 |
-
except ValueError:
|
204 |
-
pass
|
205 |
-
else:
|
206 |
-
try:
|
207 |
-
choices.remove(BELEBELE_TASK_GROUP_NAME)
|
208 |
-
except ValueError:
|
209 |
-
pass
|
210 |
-
|
211 |
values = choices.copy()
|
212 |
shown_tasks = gr.CheckboxGroup(
|
213 |
choices=choices,
|
|
|
10 |
import style
|
11 |
|
12 |
TAB_STATE = 0 # FIXME
|
13 |
+
NO_FEWSHOT = ["BELEBELE"] # FIXME
|
14 |
+
NO_ZEROSHOT = ["GSM8K", "TruthfulQA"] # FIXME
|
15 |
|
16 |
|
17 |
def init():
|
|
|
45 |
|
46 |
def sort_cols(df: pd.DataFrame, fewshot: bool = False) -> pd.DataFrame:
|
47 |
task_cols = get_task_columns(df)
|
|
|
|
|
|
|
|
|
48 |
return df.reindex(["Type", "Model_Name", "Average"] + sorted(task_cols), axis=1)
|
49 |
|
50 |
|
|
|
121 |
df = filter_type(df, model_types)
|
122 |
|
123 |
if format:
|
124 |
+
return sort_cols(df, fewshot).style.format(precision=2, decimal=".", na_rep="N/A")
|
125 |
else:
|
126 |
return sort_cols(df, fewshot)
|
127 |
|
128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
def fix_zeroshot(tasks: list[str | int | float], fewshot: bool = False):
|
130 |
global TAB_STATE
|
131 |
selected_task_type = get_selected_task_type(TAB_STATE)
|
132 |
choices = task_groups_with_task_type(selected_task_type)
|
133 |
if not fewshot:
|
134 |
+
choices = [c for c in choices if c not in NO_ZEROSHOT]
|
135 |
+
value = [v for v in tasks if v in choices]
|
136 |
+
value += [t for t in NO_FEWSHOT if t not in value]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
else:
|
|
|
|
|
|
|
|
|
138 |
if TAB_STATE == 0:
|
139 |
+
choices = [c for c in choices if c not in NO_FEWSHOT]
|
140 |
value = [v for v in tasks if v in choices]
|
141 |
+
value += [t for t in NO_ZEROSHOT if t not in value]
|
|
|
142 |
elif TAB_STATE == 1:
|
143 |
value = [v for v in tasks if v in choices]
|
144 |
shown_tasks = gr.CheckboxGroup(
|
|
|
159 |
selected_task_type = get_selected_task_type(TAB_STATE)
|
160 |
choices = task_groups_with_task_type(selected_task_type)
|
161 |
if not fewshot:
|
162 |
+
choices = [c for c in choices if c not in NO_ZEROSHOT]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
163 |
values = choices.copy()
|
164 |
shown_tasks = gr.CheckboxGroup(
|
165 |
choices=choices,
|