Update read evals to read the correct information
#4
by jcole1 - opened
src/leaderboard/read_evals.py
CHANGED
@@ -60,6 +60,7 @@ class EvalResult:
         still_on_hub, _, model_config = is_model_on_hub(
             full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
         )
+        print("Is model on hub? \n", _)
         architecture = "?"
         if model_config is not None:
             architectures = getattr(model_config, "architectures", None)
@@ -70,13 +71,15 @@ class EvalResult:
         results = {}
         for task in Tasks:
             task = task.value
-
             # We average all scores of a given metric (not all metrics are present in all files)
             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
-
-
+            if task.benchmark == "mRNA":
+                # Keep RMSE at original value
+                mean_acc = np.mean(accs)
+            else:
+                mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc

         return self(
@@ -93,8 +96,8 @@ class EvalResult:

     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
+        # print("Requests Path: ", requests_path)
         request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
         try:
             with open(request_file, "r") as f:
                 request = json.load(f)
@@ -107,9 +110,11 @@ class EvalResult:
         except Exception:
             print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")

-    def to_dict(self):
+    def to_dict(self, rank):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average =
+        average = rank
+        # average = sorted(average, reverse=True)
+        # rank = [rank+1 for rank, value in enumerate(average)]
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -138,6 +143,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
         requests_path,
         f"{model_name}_eval_request_*.json",
     )
+    # print("Request Files: ", request_files)
     request_files = glob.glob(request_files)

     # Select correct request file (precision)
@@ -146,6 +152,8 @@ def get_request_file_for_model(requests_path, model_name, precision):
     for tmp_request_file in request_files:
         with open(tmp_request_file, "r") as f:
             req_content = json.load(f)
+            # print("Request File: ", tmp_request_file)
+            # print("Req Content: ", req_content)
             if (
                 req_content["status"] in ["FINISHED"]
                 and req_content["precision"] == precision.split(".")[-1]
@@ -186,9 +194,13 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
             eval_results[eval_name] = eval_result

     results = []
-    for
+    for result in eval_results.values():
+        result.average = np.mean(list(result.results.values()))
+    sorted_results = sorted(eval_results.values(), key=lambda r: r.average, reverse=True)
+
+    for i,v in enumerate(sorted_results):
         try:
-            v.to_dict(i) # we test if the dict version is complete
+            v.to_dict(i) # we test if the dict version is complete
             results.append(v)
         except KeyError: # not all eval values present
             continue
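
For context on the new averaging branch: mRNA is scored with RMSE, so its mean is kept on the original scale, while the other metrics are still multiplied by 100 and displayed as percentages. A minimal sketch of that behaviour, assuming plain lists of per-file scores rather than the Space's Task objects (the "ssp" benchmark name in the example is invented):

import numpy as np

def aggregate_metric(benchmark: str, scores: list[float]) -> float | None:
    """Average one benchmark's per-file scores the way the patched loop does."""
    accs = np.array(scores, dtype=float)
    if accs.size == 0 or np.isnan(accs).any():
        return None  # mirrors the diff's "continue" when results are missing
    if benchmark == "mRNA":
        return float(np.mean(accs))       # keep RMSE at its original value
    return float(np.mean(accs)) * 100.0   # accuracy-style metrics shown as percentages

print(aggregate_metric("mRNA", [0.31, 0.29]))  # ~0.30, unscaled
print(aggregate_metric("ssp", [0.71, 0.75]))   # ~73.0, as a percentage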
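
The commented-out prints added to get_request_file_for_model trace how a request file is selected. A condensed sketch of that selection logic, simplified so the first match wins (the directory name, model name, and precision string in the trailing example are illustrative only):

import glob
import json
import os

def find_request_file(requests_path: str, model_name: str, precision: str) -> str:
    """Pick a FINISHED request file whose precision matches (simplified: first match wins)."""
    pattern = os.path.join(requests_path, f"{model_name}_eval_request_*.json")
    for tmp_request_file in glob.glob(pattern):
        with open(tmp_request_file, "r") as f:
            req_content = json.load(f)
        if (
            req_content["status"] in ["FINISHED"]
            and req_content["precision"] == precision.split(".")[-1]
        ):
            return tmp_request_file
    return ""

# e.g. find_request_file("eval-queue", "org__model", "Precision.float16")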
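
The last hunk reorders result construction: every EvalResult first gets an average over its task scores, the collection is sorted best-first, and the position in that order is passed to to_dict(rank), which stores it as average. A minimal sketch of the same flow with plain dictionaries standing in for EvalResult objects (all names and numbers below are invented):

import numpy as np

# Plain dicts stand in for EvalResult objects; model names and scores are invented.
eval_results = {
    "model-a_float16": {"results": {"task_1": 64.0, "task_2": 71.0}},
    "model-b_float16": {"results": {"task_1": 80.0, "task_2": 77.0}},
}

# 1. give every entry an average over its task scores
for entry in eval_results.values():
    entry["average"] = float(np.mean(list(entry["results"].values())))

# 2. sort best-first so that list position doubles as the leaderboard rank
sorted_results = sorted(eval_results.items(), key=lambda kv: kv[1]["average"], reverse=True)

# 3. the enumerate index is what the patched to_dict(rank) receives
for rank, (name, entry) in enumerate(sorted_results):
    print(rank, name, entry["average"])
# 0 model-b_float16 78.5
# 1 model-a_float16 67.5

One consequence worth noting: if to_dict still maps average into the leaderboard's average column, that column now carries a 0-based rank rather than a mean score.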