Update read evals to read the correct information
The previous read_evals logic read in the wrong data and used invalid naming information. This update fixes it so the results are read and formatted correctly.
src/leaderboard/read_evals.py CHANGED
@@ -60,6 +60,7 @@ class EvalResult:
         still_on_hub, _, model_config = is_model_on_hub(
             full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
         )
+        print("Is model on hub? \n", _)
         architecture = "?"
         if model_config is not None:
             architectures = getattr(model_config, "architectures", None)
@@ -70,13 +71,15 @@ class EvalResult:
         results = {}
         for task in Tasks:
             task = task.value
-
             # We average all scores of a given metric (not all metrics are present in all files)
             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
-
-            mean_acc = np.mean(accs) * 100.0
+            if task.benchmark == "mRNA":
+                # Keep RMSE at original value
+                mean_acc = np.mean(accs)
+            else:
+                mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
 
         return self(
@@ -93,8 +96,8 @@ class EvalResult:
 
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
+        # print("Requests Path: ", requests_path)
         request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
         try:
             with open(request_file, "r") as f:
                 request = json.load(f)
@@ -107,9 +110,11 @@ class EvalResult:
         except Exception:
             print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
 
-    def to_dict(self):
+    def to_dict(self, rank):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        average = rank
+        # average = sorted(average, reverse=True)
+        # rank = [rank+1 for rank, value in enumerate(average)]
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -138,6 +143,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
         requests_path,
         f"{model_name}_eval_request_*.json",
     )
+    # print("Request Files: ", request_files)
     request_files = glob.glob(request_files)
 
     # Select correct request file (precision)
@@ -146,6 +152,8 @@ def get_request_file_for_model(requests_path, model_name, precision):
     for tmp_request_file in request_files:
         with open(tmp_request_file, "r") as f:
             req_content = json.load(f)
+            # print("Request File: ", tmp_request_file)
+            # print("Req Content: ", req_content)
             if (
                 req_content["status"] in ["FINISHED"]
                 and req_content["precision"] == precision.split(".")[-1]
@@ -186,9 +194,13 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
             eval_results[eval_name] = eval_result
 
     results = []
-    for v in eval_results.values():
+    for result in eval_results.values():
+        result.average = np.mean(list(result.results.values()))
+    sorted_results = sorted(eval_results.values(), key=lambda r: r.average, reverse=True)
+
+    for i,v in enumerate(sorted_results):
         try:
-            v.to_dict() # we test if the dict version is complete
+            v.to_dict(i) # we test if the dict version is complete
             results.append(v)
         except KeyError: # not all eval values present
             continue
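For context, the two functional changes above can be summarized in a short, self-contained sketch. This is a minimal illustration under assumptions, not the leaderboard's actual code: the SimpleResult class, the task names, and the toy scores are hypothetical stand-ins for EvalResult and get_raw_eval_results. It shows (1) the mRNA benchmark's RMSE kept at its original scale while other metrics are rescaled to percentages, and (2) models ranked by their mean score, with the rank index passed into to_dict.

# Minimal sketch (assumption: SimpleResult stands in for EvalResult above).
import numpy as np

class SimpleResult:
    def __init__(self, name, per_task_scores):
        self.name = name
        self.results = {}
        for benchmark, accs in per_task_scores.items():
            if benchmark == "mRNA":
                # RMSE-style benchmark: keep the score at its original scale
                self.results[benchmark] = np.mean(accs)
            else:
                # Accuracy-style benchmarks: rescale to percentages
                self.results[benchmark] = np.mean(accs) * 100.0

    def to_dict(self, rank):
        # As in the commit, the "average" column now carries the rank
        return {"eval_name": self.name, "average": rank}

models = [
    SimpleResult("model-a", {"task1": [0.71, 0.73], "mRNA": [0.42]}),
    SimpleResult("model-b", {"task1": [0.80, 0.82], "mRNA": [0.35]}),
]

# Mirror get_raw_eval_results: average each model's per-task results,
# sort descending, then hand each model its position as the rank.
for m in models:
    m.average = np.mean(list(m.results.values()))
for i, m in enumerate(sorted(models, key=lambda m: m.average, reverse=True)):
    print(m.to_dict(i))  # rank 0 is the top-ranked model

Note that with this change the value stored under "average" in to_dict is the model's rank (0 = best), not its mean score.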