yangheng jcole1 committed on
Commit
e7cd420
1 Parent(s): c3e62f6

Update read evals to read the correct information (#4)

Browse files

- Update read evals to read the correct information (0520d1572db733766d6d9390c6d277ce45f22511)


Co-authored-by: Jack Cole <jcole1@users.noreply.huggingface.co>

Files changed (1) hide show
  1. src/leaderboard/read_evals.py +20 -8
src/leaderboard/read_evals.py CHANGED
@@ -60,6 +60,7 @@ class EvalResult:
60
  still_on_hub, _, model_config = is_model_on_hub(
61
  full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
62
  )
 
63
  architecture = "?"
64
  if model_config is not None:
65
  architectures = getattr(model_config, "architectures", None)
@@ -70,13 +71,15 @@ class EvalResult:
70
  results = {}
71
  for task in Tasks:
72
  task = task.value
73
-
74
  # We average all scores of a given metric (not all metrics are present in all files)
75
  accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
76
  if accs.size == 0 or any([acc is None for acc in accs]):
77
  continue
78
-
79
- mean_acc = np.mean(accs) * 100.0
 
 
 
80
  results[task.benchmark] = mean_acc
81
 
82
  return self(
@@ -93,8 +96,8 @@ class EvalResult:
93
 
94
  def update_with_request_file(self, requests_path):
95
  """Finds the relevant request file for the current model and updates info with it"""
 
96
  request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
97
-
98
  try:
99
  with open(request_file, "r") as f:
100
  request = json.load(f)
@@ -107,9 +110,11 @@ class EvalResult:
107
  except Exception:
108
  print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
109
 
110
- def to_dict(self):
111
  """Converts the Eval Result to a dict compatible with our dataframe display"""
112
- average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
 
 
113
  data_dict = {
114
  "eval_name": self.eval_name, # not a column, just a save name,
115
  AutoEvalColumn.precision.name: self.precision.value.name,
@@ -138,6 +143,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
138
  requests_path,
139
  f"{model_name}_eval_request_*.json",
140
  )
 
141
  request_files = glob.glob(request_files)
142
 
143
  # Select correct request file (precision)
@@ -146,6 +152,8 @@ def get_request_file_for_model(requests_path, model_name, precision):
146
  for tmp_request_file in request_files:
147
  with open(tmp_request_file, "r") as f:
148
  req_content = json.load(f)
 
 
149
  if (
150
  req_content["status"] in ["FINISHED"]
151
  and req_content["precision"] == precision.split(".")[-1]
@@ -186,9 +194,13 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
186
  eval_results[eval_name] = eval_result
187
 
188
  results = []
189
- for v in eval_results.values():
 
 
 
 
190
  try:
191
- v.to_dict() # we test if the dict version is complete
192
  results.append(v)
193
  except KeyError: # not all eval values present
194
  continue
 
60
  still_on_hub, _, model_config = is_model_on_hub(
61
  full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
62
  )
63
+ print("Is model on hub? \n", _)
64
  architecture = "?"
65
  if model_config is not None:
66
  architectures = getattr(model_config, "architectures", None)
 
71
  results = {}
72
  for task in Tasks:
73
  task = task.value
 
74
  # We average all scores of a given metric (not all metrics are present in all files)
75
  accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
76
  if accs.size == 0 or any([acc is None for acc in accs]):
77
  continue
78
+ if task.benchmark == "mRNA":
79
+ # Keep RMSE at original value
80
+ mean_acc = np.mean(accs)
81
+ else:
82
+ mean_acc = np.mean(accs) * 100.0
83
  results[task.benchmark] = mean_acc
84
 
85
  return self(
 
96
 
97
  def update_with_request_file(self, requests_path):
98
  """Finds the relevant request file for the current model and updates info with it"""
99
+ # print("Requests Path: ", requests_path)
100
  request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
 
101
  try:
102
  with open(request_file, "r") as f:
103
  request = json.load(f)
 
110
  except Exception:
111
  print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
112
 
113
+ def to_dict(self, rank):
114
  """Converts the Eval Result to a dict compatible with our dataframe display"""
115
+ average = rank
116
+ # average = sorted(average, reverse=True)
117
+ # rank = [rank+1 for rank, value in enumerate(average)]
118
  data_dict = {
119
  "eval_name": self.eval_name, # not a column, just a save name,
120
  AutoEvalColumn.precision.name: self.precision.value.name,
 
143
  requests_path,
144
  f"{model_name}_eval_request_*.json",
145
  )
146
+ # print("Request Files: ", request_files)
147
  request_files = glob.glob(request_files)
148
 
149
  # Select correct request file (precision)
 
152
  for tmp_request_file in request_files:
153
  with open(tmp_request_file, "r") as f:
154
  req_content = json.load(f)
155
+ # print("Request File: ", tmp_request_file)
156
+ # print("Req Content: ", req_content)
157
  if (
158
  req_content["status"] in ["FINISHED"]
159
  and req_content["precision"] == precision.split(".")[-1]
 
194
  eval_results[eval_name] = eval_result
195
 
196
  results = []
197
+ for result in eval_results.values():
198
+ result.average = np.mean(list(result.results.values()))
199
+ sorted_results = sorted(eval_results.values(), key=lambda r: r.average, reverse=True)
200
+
201
+ for i,v in enumerate(sorted_results):
202
  try:
203
+ v.to_dict(i) # we test if the dict version is complete
204
  results.append(v)
205
  except KeyError: # not all eval values present
206
  continue