jcole1 committed on
Commit
0520d15
1 Parent(s): f02c8f8

Update read evals to read the correct information


The previous read_evals logic read in the wrong data and relied on invalid naming information. This update fixes it so results are read correctly with the right formatting.

Files changed (1)
  1. src/leaderboard/read_evals.py +20 -8
src/leaderboard/read_evals.py CHANGED
@@ -60,6 +60,7 @@ class EvalResult:
         still_on_hub, _, model_config = is_model_on_hub(
             full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
         )
+        print("Is model on hub? \n", _)
         architecture = "?"
         if model_config is not None:
             architectures = getattr(model_config, "architectures", None)
@@ -70,13 +71,15 @@ class EvalResult:
         results = {}
         for task in Tasks:
             task = task.value
-
             # We average all scores of a given metric (not all metrics are present in all files)
             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
-
-            mean_acc = np.mean(accs) * 100.0
+            if task.benchmark == "mRNA":
+                # Keep RMSE at original value
+                mean_acc = np.mean(accs)
+            else:
+                mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
 
         return self(
@@ -93,8 +96,8 @@ class EvalResult:
 
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
+        # print("Requests Path: ", requests_path)
         request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
         try:
             with open(request_file, "r") as f:
                 request = json.load(f)
@@ -107,9 +110,11 @@ class EvalResult:
         except Exception:
             print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
 
-    def to_dict(self):
+    def to_dict(self, rank):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        average = rank
+        # average = sorted(average, reverse=True)
+        # rank = [rank+1 for rank, value in enumerate(average)]
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -138,6 +143,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
         requests_path,
         f"{model_name}_eval_request_*.json",
     )
+    # print("Request Files: ", request_files)
    request_files = glob.glob(request_files)
 
     # Select correct request file (precision)
@@ -146,6 +152,8 @@ def get_request_file_for_model(requests_path, model_name, precision):
     for tmp_request_file in request_files:
         with open(tmp_request_file, "r") as f:
             req_content = json.load(f)
+            # print("Request File: ", tmp_request_file)
+            # print("Req Content: ", req_content)
             if (
                 req_content["status"] in ["FINISHED"]
                 and req_content["precision"] == precision.split(".")[-1]
@@ -186,9 +194,13 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
             eval_results[eval_name] = eval_result
 
     results = []
-    for v in eval_results.values():
+    for result in eval_results.values():
+        result.average = np.mean(list(result.results.values()))
+    sorted_results = sorted(eval_results.values(), key=lambda r: r.average, reverse=True)
+
+    for i,v in enumerate(sorted_results):
         try:
-            v.to_dict() # we test if the dict version is complete
+            v.to_dict(i) # we test if the dict version is complete
             results.append(v)
         except KeyError: # not all eval values present
             continue
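
For readers following the ranking change in get_raw_eval_results and to_dict above, here is a minimal, self-contained sketch of the flow it introduces: each model's mean benchmark score is computed, models are sorted best-first, and the enumeration index is what gets passed to to_dict as the rank. The names below (rank_results, the example model entries) are illustrative stand-ins using plain dicts, not the leaderboard's actual EvalResult objects.

import numpy as np

def rank_results(eval_results):
    """Illustrative stand-in: order entries by mean benchmark score, best first."""
    for result in eval_results.values():
        # Mean over this entry's per-benchmark scores (mirrors result.average in the diff).
        result["average"] = np.mean(list(result["results"].values()))
    ranked = sorted(eval_results.values(), key=lambda r: r["average"], reverse=True)
    # The enumeration index plays the role of the rank handed to to_dict(i) above.
    return list(enumerate(ranked))

# Hypothetical example data: two models with two benchmarks each.
evals = {
    "model_a": {"results": {"benchmark_1": 71.0, "benchmark_2": 64.0}},
    "model_b": {"results": {"benchmark_1": 80.5, "benchmark_2": 77.2}},
}
for rank, entry in rank_results(evals):
    print(rank, round(entry["average"], 2))  # model_b ranks 0 (best), model_a ranks 1

Note that with this change the value stored in the dataframe's average field is the rank index itself (0 for the top model) rather than the mean score, and the mRNA benchmark is deliberately left unscaled because its metric is an RMSE rather than a percentage.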