add perplexity
- app.py +1 -1
- src/about.py +3 -0
- src/display/utils.py +1 -1
- src/leaderboard/read_evals.py +27 -4
app.py CHANGED
@@ -246,7 +246,7 @@ with demo:
             interactive=False,
             visible=True,
             # column_widths=["2%", "33%"]
-            height=
+            height=800
         )

         # Dummy leaderboard for handling the case when the user uses backspace key
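For reference, the only functional change here is pinning the leaderboard table to a fixed pixel height so it scrolls inside its own container instead of growing with the page (see the "fix scrolling on Firefox" TODO added in src/about.py below). A minimal sketch of the idea, assuming a recent Gradio version whose Dataframe component accepts a height keyword; the component name, headers, and values are illustrative, not the Space's actual code:

import gradio as gr

with gr.Blocks() as demo:
    # A fixed height gives the table its own scrollbar instead of stretching the page.
    leaderboard_table = gr.Dataframe(
        headers=["Model", "Average"],
        value=[["model-a", 51.2], ["model-b", 48.7]],  # placeholder rows
        interactive=False,
        visible=True,
        height=800,
    )

if __name__ == "__main__":
    demo.launch()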
src/about.py CHANGED
@@ -20,6 +20,7 @@ class Tasks(Enum):
     task6 = Task("polemo2_out_multiple_choice", "acc,none", "polemo2-out_mc", "multiple_choice")
     task7 = Task("polish_8tags_multiple_choice", "acc,none", "8tags_mc", "multiple_choice")
     task8 = Task("polish_8tags_regex", "exact_match,score-first", "8tags_g", "generate_until")
+    #task9a = Task("polish_belebele_mc", "acc,none", "belebele_mc", "multiple_choice")
     task9 = Task("polish_belebele_regex", "exact_match,score-first", "belebele_g", "generate_until")
     task10 = Task("polish_dyk_multiple_choice", "f1,none", "dyk_mc", "multiple_choice")
     task11 = Task("polish_dyk_regex", "f1,score-first", "dyk_g", "generate_until")
@@ -31,6 +32,7 @@ class Tasks(Enum):
     task17 = Task("polish_cbd_regex", "f1,score-first", "cbd_g", "generate_until")
     task18 = Task("polish_klej_ner_multiple_choice", "acc,none", "klej_ner_mc", "multiple_choice")
     task19 = Task("polish_klej_ner_regex", "exact_match,score-first", "klej_ner_g", "generate_until")
+    task20 = Task("polish_poleval2018_task3_test_10k", "word_perplexity,none", "polish_poleval2018_task3_test_10k", "other")

 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -72,6 +74,7 @@ or join our [Discord SpeakLeash](https://discord.gg/3G9DVM39)
 * add metadata for models (e.g. #Params)
 * add more tasks
 * use model templates
+* fix scrolling on Firefox

 ## Tasks

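The new task20 entry follows the same four-field shape as the existing tasks: the lm-evaluation-harness benchmark name, the metric key to read from the results JSON, the column name shown in the leaderboard, and a task type used later to group scores. A minimal sketch of that structure, assuming Task is a plain dataclass as in the standard leaderboard template (the field names are inferred from how they are used in read_evals.py, not copied from this file):

from dataclasses import dataclass
from enum import Enum

@dataclass(frozen=True)
class Task:
    benchmark: str   # task name in the lm-eval-harness results file
    metric: str      # metric key, e.g. "acc,none" or "word_perplexity,none"
    col_name: str    # column displayed in the leaderboard
    type: str        # "multiple_choice", "generate_until", or "other"

class Tasks(Enum):
    # Accuracy-style tasks keep their existing types...
    task8 = Task("polish_8tags_regex", "exact_match,score-first", "8tags_g", "generate_until")
    # ...while the new perplexity task is typed "other", so downstream code can
    # keep it out of the accuracy averages.
    task20 = Task("polish_poleval2018_task3_test_10k", "word_perplexity,none",
                  "polish_poleval2018_task3_test_10k", "other")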
src/display/utils.py CHANGED
@@ -26,6 +26,7 @@ auto_eval_column_dict = []
 # Init
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["lang", ColumnContent, ColumnContent("Lang", "str", True)])
 auto_eval_column_dict.append(["n_shot", ColumnContent, ColumnContent("n_shot", "str", True)])
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
@@ -39,7 +40,6 @@ auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Arch
 auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
 auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
 auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["lang", ColumnContent, ColumnContent("Lang", "str", True)])
 auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", True)])
 auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
 auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
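Moving the "lang" entry up matters because the display order of the leaderboard columns follows the order of this list. A minimal sketch of that mechanism, assuming the Space follows the standard leaderboard template where AutoEvalColumn is built dynamically from auto_eval_column_dict (the make_dataclass call and the ColumnContent fields below are the template's usual shape, reproduced from memory rather than from this file):

from dataclasses import dataclass, fields, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

auto_eval_column_dict = []
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
# Moved up in this commit: "Lang" now renders right after "Model".
auto_eval_column_dict.append(["lang", ColumnContent, ColumnContent("Lang", "str", True)])
auto_eval_column_dict.append(["n_shot", ColumnContent, ColumnContent("n_shot", "str", True)])

# Field order (and therefore column order) follows insertion order.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
print([f.default.name for f in fields(AutoEvalColumn)])  # ['Model', 'Lang', 'n_shot']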
src/leaderboard/read_evals.py CHANGED
@@ -33,6 +33,7 @@ class EvalResult:
     date: str = "" # submission date of request file
     still_on_hub: bool = False
     n_shot: NShotType = NShotType.n0
+    org_and_model: str = ""

     @classmethod
     def init_from_json_file(self, json_filepath, n_shot_num):
@@ -48,6 +49,7 @@ class EvalResult:

         # Get model and org
         org_and_model = config.get("model_name", config.get("model_args", None))
+        orig_org_and_model = org_and_model
         SPICHLERZ_ORG = "speakleash/"

         if re.match(r"^pretrained=(.*/(plgkwrobel|plggspkl)/)(models/)?", org_and_model):
@@ -91,7 +93,10 @@ class EvalResult:
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue

-            mean_acc = np.mean(accs) * 100.0
+            if 'perplexity' in task.metric:
+                mean_acc = np.mean(accs)
+            else:
+                mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc

         return self(
@@ -104,7 +109,8 @@ class EvalResult:
             revision= config.get("model_sha", ""),
             still_on_hub=still_on_hub,
             architecture=architecture,
-            n_shot=NShotType.from_str(n_shot_num)
+            n_shot=NShotType.from_str(n_shot_num),
+            org_and_model=orig_org_and_model
         )

     def update_with_metadata(self, metadata):
@@ -139,10 +145,10 @@ class EvalResult:

     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
         mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
-
+        all_tasks = g_tasks + mc_tasks
+        average = sum([v for task,v in self.results.items() if v is not None and task in all_tasks]) / len(all_tasks)
         average_g = sum([v for task,v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
         average_mc = sum([v for task,v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)

@@ -352,4 +358,21 @@ def get_raw_eval_results(results_path: str, requests_path: str, metadata) -> lis
             print(f"not all eval values present {v.eval_name} {v.full_model}")
             continue

+    missing_results_for_task = {}
+    for v in eval_results.values():
+        r = v.to_dict()
+        for task in Tasks:
+            if r[task.value.col_name] is None:
+                task_name = f"{r['n_shot']}|{task.value.benchmark}"
+                if task_name in missing_results_for_task:
+                    missing_results_for_task[task_name].append(f"{v.full_model}|{v.org_and_model}")
+                else:
+                    missing_results_for_task[task_name] = [f"{v.full_model}|{v.org_and_model}"]
+
+    # print('missing_results_for_task', missing_results_for_task)
+    for task, models in missing_results_for_task.items():
+        print(f"Missing results for {task} for {len(models)} models")
+        print(" ".join(models))
+
+
     return results
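Taken together, the scoring changes keep word perplexity on its natural scale (lower is better) instead of multiplying it by 100, and they compute the headline average only over the generate_until and multiple_choice tasks, so the new perplexity column cannot distort it; the trailing loop merely reports which models are missing results for which task. A minimal, self-contained sketch of the scoring logic with made-up numbers (task names, types, and values are illustrative only):

import numpy as np

# Per-task raw scores, roughly as they come out of lm-evaluation-harness.
raw = {
    "8tags_mc":        {"metric": "acc,none",                "type": "multiple_choice", "scores": [1, 0, 1, 1]},
    "belebele_g":      {"metric": "exact_match,score-first", "type": "generate_until",  "scores": [0, 1, 1, 0]},
    "poleval2018_10k": {"metric": "word_perplexity,none",    "type": "other",           "scores": [23.4]},
}

results = {}
for name, t in raw.items():
    scores = np.array(t["scores"], dtype=float)
    # Perplexity stays on its own scale; accuracy-like metrics become percentages.
    if "perplexity" in t["metric"]:
        results[name] = np.mean(scores)
    else:
        results[name] = np.mean(scores) * 100.0

# Only accuracy-style task types feed the headline average.
avg_tasks = [n for n, t in raw.items() if t["type"] in ("generate_until", "multiple_choice")]
average = sum(results[n] for n in avg_tasks) / len(avg_tasks)

print(results)            # {'8tags_mc': 75.0, 'belebele_g': 50.0, 'poleval2018_10k': 23.4}
print(round(average, 1))  # 62.5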