Sean Cho committed • bd9a9ad
Parent(s): 6f030e8

revert logic
Files changed:
- src/leaderboard/read_evals.py +29 -1
- src/populate.py +0 -1
- src/tools/plots.py +19 -1
src/leaderboard/read_evals.py
CHANGED
@@ -103,6 +103,13 @@ class EvalResult:
                 results[task.benchmark] = 0.0
                 continue
 
+            # New tasks have been added, we need to skip them if not exists
+            if task.benchmark in ["ko_winogrande", "ko_gsm8k", "ko_eq_bench", "ko_inst_follow", "kor_nat_cka", "kor_nat_sva", "ko_harmlessness", "ko_helpfulness"]:
+                accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
+                if accs.size == 0 or any([acc is None for acc in accs]):
+                    results[task.benchmark] = 0.0
+                    continue
+
             # We average all scores of a given metric (mostly for mmlu)
             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
             if accs.size == 0 or any([acc is None for acc in accs]):
@@ -144,7 +151,28 @@ class EvalResult:
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
 
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        # Skip the two new tasks for now
+        # TODO: safely remove this code when the task results are all added
+        skip_avg_len = 0
+        if self.results['ko_winogrande'] == 0.0:
+            skip_avg_len += 1
+        if self.results['ko_gsm8k'] == 0.0:
+            skip_avg_len += 1
+        if self.results['ko_eq_bench'] == 0.0:
+            skip_avg_len += 1
+        if self.results['ko_inst_follow'] == 0.0:
+            skip_avg_len += 1
+        if self.results['kor_nat_cka'] == 0.0:
+            skip_avg_len += 1
+        if self.results['kor_nat_sva'] == 0.0:
+            skip_avg_len += 1
+        if self.results['ko_harmlessness'] == 0.0:
+            skip_avg_len += 1
+        if self.results['ko_helpfulness'] == 0.0:
+            skip_avg_len += 1
+
+        average = sum([v for v in self.results.values() if v is not None]) / (len(Tasks) - skip_avg_len)
+
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
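Taken together, the two hunks re-add a "skip unscored new tasks" rule: during parsing, any of the eight new benchmarks with no result is recorded as 0.0, and to_dict then excludes those 0.0 entries from the average's denominator. A minimal, self-contained sketch of that averaging rule (NEW_TASKS and compute_average are illustrative names, not code from this repo):

# Sketch only: mirrors the skip logic above, under the assumption that a
# 0.0 score on a new benchmark means "not yet evaluated".
NEW_TASKS = [
    "ko_winogrande", "ko_gsm8k", "ko_eq_bench", "ko_inst_follow",
    "kor_nat_cka", "kor_nat_sva", "ko_harmlessness", "ko_helpfulness",
]

def compute_average(results: dict[str, float], num_tasks: int) -> float:
    """Average all scores, leaving still-unscored (0.0) new tasks out of the denominator."""
    skip_avg_len = sum(1 for t in NEW_TASKS if results.get(t) == 0.0)
    total = sum(v for v in results.values() if v is not None)
    return total / (num_tasks - skip_avg_len)

# Example: two of four benchmarks are still unscored.
scores = {"ko_arc": 0.55, "ko_hellaswag": 0.61, "ko_gsm8k": 0.0, "ko_eq_bench": 0.0}
print(compute_average(scores, num_tasks=len(scores)))  # 0.58 rather than 0.29

One caveat the TODO hints at: a model that legitimately scores 0.0 on one of these benchmarks is indistinguishable from one that was never evaluated on it, which is presumably why this code is meant to be removed once all task results are in.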
src/populate.py
CHANGED
@@ -16,7 +16,6 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     filter_models(all_data_json)
 
     df = pd.DataFrame.from_records(all_data_json)
-    print(df.to_string())
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
 
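The deleted print(df.to_string()) appears to be leftover debug output that dumped the whole leaderboard table to the logs on every load. For reference, a self-contained sketch of the surrounding pandas pipeline with toy data (the record keys here are made up, not the leaderboard's real columns):

import pandas as pd

# Toy stand-in for all_data_json.
records = [
    {"model": "model-a", "average": 51.2345, "ko_arc": 43.1},
    {"model": "model-b", "average": 73.4567, "ko_arc": 61.0},
]

df = pd.DataFrame.from_records(records)
df = df.sort_values(by=["average"], ascending=False)  # best model first
df = df[["model", "average"]].round(decimals=2)       # display columns only, 2 dp
print(df)
#      model  average
# 1  model-b    73.46
# 0  model-a    51.23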
src/tools/plots.py
CHANGED
@@ -36,7 +36,25 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
 
         current_date = row["date"]
         if task.benchmark == "Average":
-            current_score = np.mean(list(row["results"].values()))
+            avg_skip_len = 0
+            if row["results"]["ko_winogrande"] == 0.0:
+                avg_skip_len += 1
+            if row["results"]["ko_gsm8k"] == 0.0:
+                avg_skip_len += 1
+            if row["results"]["ko_eq_bench"] == 0.0:
+                avg_skip_len += 1
+            if row["results"]["ko_inst_follow"] == 0.0:
+                avg_skip_len += 1
+            if row["results"]["kor_nat_cka"] == 0.0:
+                avg_skip_len += 1
+            if row["results"]["kor_nat_sva"] == 0.0:
+                avg_skip_len += 1
+            if row["results"]["ko_harmlessness"] == 0.0:
+                avg_skip_len += 1
+            if row["results"]["ko_helpfulness"] == 0.0:
+                avg_skip_len += 1
+
+            current_score = np.sum(list(row["results"].values())) / (len(row["results"]) - avg_skip_len)
         else:
             current_score = row["results"][task.benchmark]
 
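This branch duplicates the skip rule from read_evals.py, hardcoding the same eight keys, with one difference worth noting: to_dict divides by len(Tasks) - skip_avg_len, while this code divides by len(row["results"]) - avg_skip_len, so the two only agree when every task has an entry in results. A sketch of a shared helper both call sites could use (skip_adjusted_average is a suggested name, not repo code):

import numpy as np

NEW_TASKS = [
    "ko_winogrande", "ko_gsm8k", "ko_eq_bench", "ko_inst_follow",
    "kor_nat_cka", "kor_nat_sva", "ko_harmlessness", "ko_helpfulness",
]

def skip_adjusted_average(results: dict[str, float], total_tasks: int | None = None) -> float:
    """Average over results, dropping new tasks that are still 0.0 (unscored).

    total_tasks defaults to len(results), matching the plots.py behaviour;
    pass the full task count (len(Tasks)) to match to_dict in read_evals.py.
    """
    skipped = sum(1 for t in NEW_TASKS if results.get(t) == 0.0)
    denom = (total_tasks if total_tasks is not None else len(results)) - skipped
    return float(np.sum(list(results.values()))) / denom

Looping over NEW_TASKS instead of writing eight separate if statements would also keep the two copies of the benchmark list from drifting apart.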