Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
change belebele task
Browse files
- app.py +4 -0
- src/about.py +2 -2
- src/leaderboard/read_evals.py +4 -1
app.py
CHANGED
@@ -68,6 +68,10 @@ leaderboard_df = original_df.copy()
|
|
68 |
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
69 |
|
70 |
def style_df(df: pd.DataFrame) -> Styler:
|
|
|
|
|
|
|
|
|
71 |
leaderboard_df_styled = df.style.background_gradient(cmap="viridis")
|
72 |
rounding = {'#Params (B)': "{:.1f}"}
|
73 |
for task in Tasks:
|
|
|
68 |
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
69 |
|
70 |
def style_df(df: pd.DataFrame) -> Styler:
|
71 |
+
# new_df = df.copy(deep=True)
|
72 |
+
# new_df['polish_poleval2018_task3_test_10k'] = -new_df['polish_poleval2018_task3_test_10k']
|
73 |
+
# new_df = new_df.to_frame()
|
74 |
+
|
75 |
leaderboard_df_styled = df.style.background_gradient(cmap="viridis")
|
76 |
rounding = {'#Params (B)': "{:.1f}"}
|
77 |
for task in Tasks:
|
src/about.py
CHANGED
@@ -13,14 +13,14 @@ class Task:
|
|
13 |
# ---------------------------------------------------
|
14 |
class Tasks(Enum):
|
15 |
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
16 |
-
task2 = Task("belebele_pol_Latn", "acc,none", "belebele_pol_Latn", "multiple_choice")
|
17 |
task3 = Task("polemo2_in", "exact_match,score-first", "polemo2-in_g", "generate_until")
|
18 |
task4 = Task("polemo2_in_multiple_choice", "acc,none", "polemo2-in_mc", "multiple_choice")
|
19 |
task5 = Task("polemo2_out", "exact_match,score-first", "polemo2-out_g", "generate_until")
|
20 |
task6 = Task("polemo2_out_multiple_choice", "acc,none", "polemo2-out_mc", "multiple_choice")
|
21 |
task7 = Task("polish_8tags_multiple_choice", "acc,none", "8tags_mc", "multiple_choice")
|
22 |
task8 = Task("polish_8tags_regex", "exact_match,score-first", "8tags_g", "generate_until")
|
23 |
-
|
24 |
task9 = Task("polish_belebele_regex", "exact_match,score-first", "belebele_g", "generate_until")
|
25 |
task10 = Task("polish_dyk_multiple_choice", "f1,none", "dyk_mc", "multiple_choice")
|
26 |
task11 = Task("polish_dyk_regex", "f1,score-first", "dyk_g", "generate_until")
|
|
|
13 |
# ---------------------------------------------------
|
14 |
class Tasks(Enum):
|
15 |
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
16 |
+
# task2 = Task("belebele_pol_Latn", "acc,none", "belebele_pol_Latn", "multiple_choice")
|
17 |
task3 = Task("polemo2_in", "exact_match,score-first", "polemo2-in_g", "generate_until")
|
18 |
task4 = Task("polemo2_in_multiple_choice", "acc,none", "polemo2-in_mc", "multiple_choice")
|
19 |
task5 = Task("polemo2_out", "exact_match,score-first", "polemo2-out_g", "generate_until")
|
20 |
task6 = Task("polemo2_out_multiple_choice", "acc,none", "polemo2-out_mc", "multiple_choice")
|
21 |
task7 = Task("polish_8tags_multiple_choice", "acc,none", "8tags_mc", "multiple_choice")
|
22 |
task8 = Task("polish_8tags_regex", "exact_match,score-first", "8tags_g", "generate_until")
|
23 |
+
task9a = Task("polish_belebele_mc", "acc,none", "belebele_mc", "multiple_choice")
|
24 |
task9 = Task("polish_belebele_regex", "exact_match,score-first", "belebele_g", "generate_until")
|
25 |
task10 = Task("polish_dyk_multiple_choice", "f1,none", "dyk_mc", "multiple_choice")
|
26 |
task11 = Task("polish_dyk_regex", "f1,score-first", "dyk_g", "generate_until")
|
src/leaderboard/read_evals.py
CHANGED
@@ -376,7 +376,10 @@ def get_raw_eval_results(results_path: str, requests_path: str, metadata) -> lis
|
|
376 |
# print('missing_results_for_task', missing_results_for_task)
|
377 |
for task, models in missing_results_for_task.items():
|
378 |
print(f"Missing results for {task} for {len(models)} models")
|
379 |
-
print(" ".join(models))
|
|
|
|
|
|
|
380 |
|
381 |
print(f"Missing metadata for {len(missing_metadata)} models")
|
382 |
for model in missing_metadata:
|
|
|
376 |
# print('missing_results_for_task', missing_results_for_task)
|
377 |
for task, models in missing_results_for_task.items():
|
378 |
print(f"Missing results for {task} for {len(models)} models")
|
379 |
+
# print(" ".join(models))
|
380 |
+
for model in models:
|
381 |
+
print(f'"{model}"')
|
382 |
+
print()
|
383 |
|
384 |
print(f"Missing metadata for {len(missing_metadata)} models")
|
385 |
for model in missing_metadata:
|