yangheng and jcole1 committed
Commit: d96e80d
Parent: a60a695

Update populate to add rank and model information (#2)


- Update populate to add rank and model information (5f986f5a6e02e7902fbdb271bd9958ee9296a14d)


Co-authored-by: Jack Cole <jcole1@users.noreply.huggingface.co>

Files changed (1)
  1. src/populate.py +12 -5
src/populate.py CHANGED
@@ -1,8 +1,9 @@
 import json
 import os
-
+import numpy as np
 import pandas as pd
 
+
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
@@ -11,15 +12,21 @@ from src.leaderboard.read_evals import get_raw_eval_results
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
-    all_data_json = [v.to_dict() for v in raw_data]
+    for result in raw_data:
+        result.average = np.mean(list(result.results.values()))
+    sorted_results = sorted(raw_data, key=lambda r: r.average, reverse=True)
+    # each result's rank is its 1-based position after sorting by average score;
+    # the rank is passed to to_dict() so it appears as a column in the dataframe
+    all_data_json = [v.to_dict(i + 1) for i, v in enumerate(sorted_results)]
 
     df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
-    return raw_data, df
+    print(df)
+    return df
 
 
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
@@ -55,4 +62,4 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
     df_running = pd.DataFrame.from_records(running_list, columns=cols)
     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]
+    return df_finished[cols], df_running[cols], df_pending[cols]
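
For context, a minimal, runnable sketch of the ranking logic this commit adds to get_leaderboard_df(). The EvalResultStub class, its results dict, and the to_dict(rank) signature below are assumptions inferred from the diff, not the repository's real src.leaderboard.read_evals classes:

from dataclasses import dataclass

import numpy as np


# Hypothetical stand-in for the objects returned by get_raw_eval_results();
# only the fields the diff touches (results, average, to_dict) are modelled.
@dataclass
class EvalResultStub:
    model: str
    results: dict  # benchmark name -> score
    average: float = 0.0

    def to_dict(self, rank: int) -> dict:
        # mirror the new to_dict(rank) call: emit the rank alongside the
        # model name, its average, and the per-benchmark scores
        return {"Rank": rank, "Model": self.model, "Average": self.average, **self.results}


raw_data = [
    EvalResultStub("model-a", {"bench1": 0.70, "bench2": 0.80}),
    EvalResultStub("model-b", {"bench1": 0.90, "bench2": 0.85}),
]

# same three steps as the updated get_leaderboard_df(): average the scores,
# sort descending by that average, pass the 1-based position as the rank
for result in raw_data:
    result.average = float(np.mean(list(result.results.values())))
sorted_results = sorted(raw_data, key=lambda r: r.average, reverse=True)
all_data_json = [v.to_dict(i + 1) for i, v in enumerate(sorted_results)]

print(all_data_json)  # model-b is rank 1 (avg 0.875), model-a is rank 2 (avg 0.75)

Because all_data_json is built from the pre-sorted list, the explicit df.sort_values on AutoEvalColumn.average is no longer needed, which is why the commit comments it out.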