Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
eduagarcia
committed on
Commit
•
6269bd0
1
Parent(s):
1a3f05a
Allow old model metrics
Browse files
src/display/changelog.py
CHANGED
@@ -1,6 +1,10 @@
|
|
1 |
CHANGELOG_TEXT = f"""
|
2 |
# Changes made to the leaderboard
|
3 |
|
4 |
-
### [
|
|
|
|
|
|
|
|
|
5 |
Protype version launched with 7 benchmarks ENEM, BLUEX, OAB Exams, ASSIN 2 RTE and STS, FAQUAD NLI and SPARROW POR
|
6 |
"""
|
|
|
1 |
CHANGELOG_TEXT = f"""
|
2 |
# Changes made to the leaderboard
|
3 |
|
4 |
+
### [1.1.0] - 2024-02-16
|
5 |
+
Removed the Sparrow POR benchmark from the leaderboard because of low quality annotations
|
6 |
+
Added HateBR Offensive, PT Hate Speech and tweetSentBR benchmarks to the leaderboard, started new evaluation queue for these benchmarks
|
7 |
+
|
8 |
+
### [1.0.0] - 2024-02-01
|
9 |
Protype version launched with 7 benchmarks ENEM, BLUEX, OAB Exams, ASSIN 2 RTE and STS, FAQUAD NLI and SPARROW POR
|
10 |
"""
|
src/leaderboard/read_evals.py
CHANGED
@@ -216,7 +216,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
|
|
216 |
with open(tmp_request_file, "r") as f:
|
217 |
req_content = json.load(f)
|
218 |
if (
|
219 |
-
req_content["status"] in ["FINISHED"]
|
220 |
and req_content["precision"] == precision.split(".")[-1]
|
221 |
):
|
222 |
request_file = tmp_request_file
|
@@ -262,7 +262,7 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
|
|
262 |
results = []
|
263 |
for v in eval_results.values():
|
264 |
try:
|
265 |
-
if v.status
|
266 |
v.to_dict() # we test if the dict version is complete
|
267 |
results.append(v)
|
268 |
except KeyError as e: # not all eval values present
|
|
|
216 |
with open(tmp_request_file, "r") as f:
|
217 |
req_content = json.load(f)
|
218 |
if (
|
219 |
+
req_content["status"] in ["FINISHED", "PENDING_NEW_EVAL"]
|
220 |
and req_content["precision"] == precision.split(".")[-1]
|
221 |
):
|
222 |
request_file = tmp_request_file
|
|
|
262 |
results = []
|
263 |
for v in eval_results.values():
|
264 |
try:
|
265 |
+
if v.status in ["FINISHED", "PENDING_NEW_EVAL"] and not v.hidden:
|
266 |
v.to_dict() # we test if the dict version is complete
|
267 |
results.append(v)
|
268 |
except KeyError as e: # not all eval values present
|