disable psc_g; rag avg

Files changed:
- src/about.py +1 -1
- src/display/utils.py +3 -0
- src/leaderboard/read_evals.py +7 -0

src/about.py CHANGED

@@ -28,7 +28,7 @@ class Tasks(Enum):
     task12 = Task("polish_ppc_multiple_choice", "acc,none", "ppc_mc", "multiple_choice", 0.419)
     task13 = Task("polish_ppc_regex", "exact_match,score-first", "ppc_g", "generate_until", 0.419)
     task14 = Task("polish_psc_multiple_choice", "f1,none", "psc_mc", "multiple_choice", 0.466)
-    task15 = Task("polish_psc_regex", "f1,score-first", "psc_g", "generate_until", 0.466)
+    # task15 = Task("polish_psc_regex", "f1,score-first", "psc_g", "generate_until", 0.466) # disabled until recalculation
     task16 = Task("polish_cbd_multiple_choice", "f1,none", "cbd_mc", "multiple_choice", 0.149)
     task17 = Task("polish_cbd_regex", "f1,score-first", "cbd_g", "generate_until", 0.149)
     task18 = Task("polish_klej_ner_multiple_choice", "acc,none", "klej_ner_mc", "multiple_choice", 0.343)
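
For reference, each Task above bundles the lm-eval benchmark name, the metric key, the leaderboard column name, the task type, and a random-baseline score. A minimal sketch of what the Task definition presumably looks like (the field names are inferred from the accesses task.value.benchmark, task.value.type, and task.value.col_name in read_evals.py below; treat this as an assumption, not the actual source):

from dataclasses import dataclass

@dataclass
class Task:
    benchmark: str   # lm-eval task name, e.g. "polish_psc_regex" (assumed field name)
    metric: str      # metric key in the results file, e.g. "f1,score-first"
    col_name: str    # short column label shown on the leaderboard, e.g. "psc_g"
    type: str        # "generate_until" or "multiple_choice"
    baseline: float  # random-baseline score used when normalizing averages

Because g_tasks and mc_tasks in read_evals.py are built by iterating over the Tasks enum, commenting out task15 removes psc_g from the displayed columns and from every derived average at once.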
src/display/utils.py CHANGED

@@ -34,9 +34,12 @@ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average
 auto_eval_column_dict.append(["average_old", ColumnContent, ColumnContent("Average old", "number", False)])
 auto_eval_column_dict.append(["average_g", ColumnContent, ColumnContent("Avg g", "number", True)])
 auto_eval_column_dict.append(["average_mc", ColumnContent, ColumnContent("Avg mc", "number", True)])
+
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
+auto_eval_column_dict.append(["average_rag", ColumnContent, ColumnContent("Avg RAG", "number", True)])
+
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
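
Judging by the calls above, ColumnContent takes a display name, a value type, a shown-by-default flag, and an optional extra flag (used for weight_type). A sketch of the definition this presumably matches, following the common Hugging Face leaderboard template; the defaulted field name below is an assumption:

from dataclasses import dataclass

@dataclass
class ColumnContent:
    name: str                   # column header shown in the dataframe
    type: str                   # "number", "str", ...
    displayed_by_default: bool  # whether the column starts visible
    hidden: bool = False        # assumed name of the fourth positional flag

The new "Avg RAG" column is registered with its shown-by-default flag set to True, so it appears immediately alongside "Avg g" and "Avg mc".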
src/leaderboard/read_evals.py CHANGED

@@ -166,6 +166,7 @@ class EvalResult:
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
         mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
+        rag_tasks = ['polish_polqa_reranking_multiple_choice', 'polish_polqa_open_book']
         all_tasks = g_tasks + mc_tasks
         all_tasks_wo_polqa = [task for task in all_tasks if 'polqa' not in task]
 
@@ -188,6 +189,7 @@ class EvalResult:
         average = sum([(self.results.get(task,0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in all_tasks]) / len(all_tasks)
         average_g = sum([(self.results.get(task,0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in g_tasks]) / len(g_tasks)
         average_mc = sum([(self.results.get(task,0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in mc_tasks]) / len(mc_tasks)
+        average_rag = sum([(self.results.get(task,0) - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100 for task in rag_tasks]) / len(rag_tasks)
 
         data_dict = {}
         # data_dict = {
@@ -280,6 +282,11 @@ class EvalResult:
         except KeyError:
             print(f"Could not find average_mc")
 
+        try:
+            data_dict[AutoEvalColumn.average_rag.name] = average_rag
+        except KeyError:
+            print(f"Could not find average_rag")
+
         try:
             data_dict[AutoEvalColumn.license.name] = self.license
         except KeyError:
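
All four averages (average, average_g, average_mc, and the new average_rag) apply the same rescaling: a task's raw score is shifted and stretched so that its random baseline maps to 0 and a perfect 100 stays 100, and the rescaled scores are then averaged. A self-contained sketch of the computation; the scores and baselines below are made-up numbers for illustration, not real leaderboard values:

def normalized_score(score: float, baseline: float) -> float:
    # Rescale so the random baseline maps to 0 and 100 stays 100.
    return (score - baseline) / (100 - baseline) * 100

# Hypothetical inputs, on the 0-100 scale the formula assumes.
rag_scores = {"polish_polqa_reranking_multiple_choice": 75.0,
              "polish_polqa_open_book": 60.0}
rag_baselines = {"polish_polqa_reranking_multiple_choice": 50.0,
                 "polish_polqa_open_book": 40.0}

average_rag = sum(normalized_score(rag_scores[t], rag_baselines.get(t, 0))
                  for t in rag_scores) / len(rag_scores)
print(round(average_rag, 1))  # (25/50 + 20/60) * 100 / 2 = 41.7

Note that self.results.get(task, 0) means a task missing from the results contributes (0 - baseline) / (100 - baseline) * 100, a negative value, rather than being skipped.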