djstrong committed on
Commit
96fbe7c
1 Parent(s): 1bea7de

keep old Average

Browse files
Files changed (3) hide show
  1. app.py +1 -1
  2. src/display/utils.py +1 -0
  3. src/leaderboard/read_evals.py +7 -1
app.py CHANGED
@@ -76,7 +76,7 @@ def style_df(df: pd.DataFrame) -> Styler:
76
  rounding = {'#Params (B)': "{:.1f}"}
77
  for task in Tasks:
78
  rounding[task.value.col_name] = "{:.2f}"
79
- for column_name in ["Average ⬆️", "Avg g", "Avg mc"]:
80
  rounding[column_name] = "{:.2f}"
81
  leaderboard_df_styled = leaderboard_df_styled.format(rounding)
82
  return leaderboard_df_styled
 
76
  rounding = {'#Params (B)': "{:.1f}"}
77
  for task in Tasks:
78
  rounding[task.value.col_name] = "{:.2f}"
79
+ for column_name in ["Average ⬆️", "Avg g", "Avg mc", "Average old"]:
80
  rounding[column_name] = "{:.2f}"
81
  leaderboard_df_styled = leaderboard_df_styled.format(rounding)
82
  return leaderboard_df_styled
src/display/utils.py CHANGED
@@ -30,6 +30,7 @@ auto_eval_column_dict.append(["lang", ColumnContent, ColumnContent("Lang", "str"
30
  auto_eval_column_dict.append(["n_shot", ColumnContent, ColumnContent("n_shot", "str", True)])
31
  #Scores
32
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 
33
  auto_eval_column_dict.append(["average_g", ColumnContent, ColumnContent("Avg g", "number", True)])
34
  auto_eval_column_dict.append(["average_mc", ColumnContent, ColumnContent("Avg mc", "number", True)])
35
  for task in Tasks:
 
30
  auto_eval_column_dict.append(["n_shot", ColumnContent, ColumnContent("n_shot", "str", True)])
31
  #Scores
32
  auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
33
+ auto_eval_column_dict.append(["average_old", ColumnContent, ColumnContent("Average old", "number", False)])
34
  auto_eval_column_dict.append(["average_g", ColumnContent, ColumnContent("Avg g", "number", True)])
35
  auto_eval_column_dict.append(["average_mc", ColumnContent, ColumnContent("Avg mc", "number", True)])
36
  for task in Tasks:
src/leaderboard/read_evals.py CHANGED
@@ -157,10 +157,11 @@ class EvalResult:
157
  g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
158
  mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
159
  all_tasks = g_tasks + mc_tasks
 
160
 
161
  baselines = {task.value.benchmark: task.value.baseline*100 for task in Tasks}
162
 
163
- # average = sum([v for task, v in self.results.items() if v is not None and task in all_tasks]) / len(all_tasks)
164
  # average_g = sum([v for task, v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
165
  # average_mc = sum([v for task, v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
166
  # print('XXXXXXXXXXXX')
@@ -249,6 +250,11 @@ class EvalResult:
249
  except AttributeError:
250
  print(f"AttributeError revision")
251
 
 
 
 
 
 
252
  try:
253
  data_dict[AutoEvalColumn.average.name] = average
254
  except KeyError:
 
157
  g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
158
  mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
159
  all_tasks = g_tasks + mc_tasks
160
+ all_tasks_wo_polqa = [task for task in all_tasks if 'polqa' not in task]
161
 
162
  baselines = {task.value.benchmark: task.value.baseline*100 for task in Tasks}
163
 
164
+ average_old = sum([v for task, v in self.results.items() if v is not None and task in all_tasks_wo_polqa]) / len(all_tasks_wo_polqa)
165
  # average_g = sum([v for task, v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
166
  # average_mc = sum([v for task, v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
167
  # print('XXXXXXXXXXXX')
 
250
  except AttributeError:
251
  print(f"AttributeError revision")
252
 
253
+ try:
254
+ data_dict[AutoEvalColumn.average_old.name] = average_old
255
+ except KeyError:
256
+ print(f"Could not find average_old")
257
+
258
  try:
259
  data_dict[AutoEvalColumn.average.name] = average
260
  except KeyError: