Update read evals to read the correct information
The previous read_evals logic read in the wrong data and used invalid naming information. This update fixes it so the results are read and formatted correctly.
src/leaderboard/read_evals.py CHANGED
@@ -60,6 +60,7 @@ class EvalResult:
         still_on_hub, _, model_config = is_model_on_hub(
             full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
         )
+        print("Is model on hub? \n", _)
         architecture = "?"
         if model_config is not None:
             architectures = getattr(model_config, "architectures", None)
@@ -70,13 +71,15 @@ class EvalResult:
         results = {}
         for task in Tasks:
             task = task.value
-
             # We average all scores of a given metric (not all metrics are present in all files)
             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
-
-            mean_acc = np.mean(accs) * 100.0
+            if task.benchmark == "mRNA":
+                # Keep RMSE at original value
+                mean_acc = np.mean(accs)
+            else:
+                mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
 
         return self(
@@ -93,8 +96,8 @@ class EvalResult:
 
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
+        # print("Requests Path: ", requests_path)
         request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
         try:
             with open(request_file, "r") as f:
                 request = json.load(f)
@@ -107,9 +110,11 @@ class EvalResult:
         except Exception:
             print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
 
-    def to_dict(self):
+    def to_dict(self, rank):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        average = rank
+        # average = sorted(average, reverse=True)
+        # rank = [rank+1 for rank, value in enumerate(average)]
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -138,6 +143,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
         requests_path,
         f"{model_name}_eval_request_*.json",
     )
+    # print("Request Files: ", request_files)
     request_files = glob.glob(request_files)
 
     # Select correct request file (precision)
@@ -146,6 +152,8 @@ def get_request_file_for_model(requests_path, model_name, precision):
     for tmp_request_file in request_files:
         with open(tmp_request_file, "r") as f:
             req_content = json.load(f)
+            # print("Request File: ", tmp_request_file)
+            # print("Req Content: ", req_content)
             if (
                 req_content["status"] in ["FINISHED"]
                 and req_content["precision"] == precision.split(".")[-1]
@@ -186,9 +194,13 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
             eval_results[eval_name] = eval_result
 
     results = []
-    for v in eval_results.values():
+    for result in eval_results.values():
+        result.average = np.mean(list(result.results.values()))
+    sorted_results = sorted(eval_results.values(), key=lambda r: r.average, reverse=True)
+
+    for i,v in enumerate(sorted_results):
         try:
-            v.to_dict() # we test if the dict version is complete
+            v.to_dict(i) # we test if the dict version is complete
             results.append(v)
         except KeyError: # not all eval values present
             continue
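For context, the two functional changes above can be summarized in a short, self-contained sketch. This is a minimal illustration under assumptions, not the leaderboard's actual code: the SimpleResult class, the task names, and the toy scores are hypothetical stand-ins for EvalResult and get_raw_eval_results. It shows (1) the mRNA benchmark's RMSE kept at its original scale while other metrics are rescaled to percentages, and (2) models ranked by their mean score, with the rank index passed into to_dict.

# Minimal sketch (assumption: SimpleResult stands in for EvalResult above).
import numpy as np

class SimpleResult:
    def __init__(self, name, per_task_scores):
        self.name = name
        self.results = {}
        for benchmark, accs in per_task_scores.items():
            if benchmark == "mRNA":
                # RMSE-style benchmark: keep the score at its original scale
                self.results[benchmark] = np.mean(accs)
            else:
                # Accuracy-style benchmarks: rescale to percentages
                self.results[benchmark] = np.mean(accs) * 100.0

    def to_dict(self, rank):
        # As in the commit, the "average" column now carries the rank
        return {"eval_name": self.name, "average": rank}

models = [
    SimpleResult("model-a", {"task1": [0.71, 0.73], "mRNA": [0.42]}),
    SimpleResult("model-b", {"task1": [0.80, 0.82], "mRNA": [0.35]}),
]

# Mirror get_raw_eval_results: average each model's per-task results,
# sort descending, then hand each model its position as the rank.
for m in models:
    m.average = np.mean(list(m.results.values()))
for i, m in enumerate(sorted(models, key=lambda m: m.average, reverse=True)):
    print(m.to_dict(i))  # rank 0 is the top-ranked model

Note that with this change the value stored under "average" in to_dict is the model's rank (0 = best), not its mean score.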