open_pl_llm_leaderboard

Restarting on CPU Upgrade

App Files Files Community

Linker1907 committed on Jul 13, 2023

Commit

d16cee2

•

1 Parent(s): e868f35

Using the new backend

Browse files

Files changed (6) hide show

README.md +1 -0
app.py +52 -27
src/assets/text_content.py +35 -5
src/auto_leaderboard/load_results.py +44 -41
src/init.py +21 -10
src/utils_display.py +6 -6

README.md CHANGED Viewed

@@ -8,6 +8,7 @@ sdk_version: 3.27.0
 app_file: app.py
 pinned: true
 license: apache-2.0
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 app_file: app.py
 pinned: true
 license: apache-2.0
+duplicated_from: HuggingFaceH4/open_llm_leaderboard
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -15,26 +15,40 @@ from src.assets.text_content import *
 from src.auto_leaderboard.load_results import get_eval_results_dicts, make_clickable_model
 from src.assets.hardcoded_evals import gpt4_values, gpt35_values, baseline
 from src.assets.css_html_js import custom_css, get_window_url_params
-from src.utils_display import AutoEvalColumn, EvalQueueColumn, EloEvalColumn, fields, styled_error, styled_warning, styled_message
-from src.init import load_all_info_from_hub
 # clone / pull the lmeh eval data
 H4_TOKEN = os.environ.get("H4_TOKEN", None)
-LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
 IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
 ADD_PLOTS = False
-EVAL_REQUESTS_PATH = "auto_evals/eval_requests"
-api = HfApi()
 def restart_space():
     api.restart_space(
         repo_id="HuggingFaceH4/open_llm_leaderboard", token=H4_TOKEN
     )
-auto_eval_repo, requested_models = load_all_info_from_hub(LMEH_REPO)
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
@@ -60,9 +74,12 @@ def has_nan_values(df, columns):
 def get_leaderboard_df():
-    if auto_eval_repo:
         print("Pulling evaluation results for the leaderboard.")
-        auto_eval_repo.git_pull()
     all_data = get_eval_results_dicts(IS_PUBLIC)
@@ -84,9 +101,12 @@ def get_leaderboard_df():
 def get_evaluation_queue_df():
     # todo @saylortwift: replace the repo by the one you created for the eval queue
-    if auto_eval_repo:
         print("Pulling changes for the evaluation queue.")
-        auto_eval_repo.git_pull()
     entries = [
         entry
@@ -106,7 +126,7 @@ def get_evaluation_queue_df():
             data["revision"] = data.get("revision", "main")
             all_evals.append(data)
-        else:
             # this is a folder
             sub_entries = [
                 e
@@ -124,10 +144,10 @@ def get_evaluation_queue_df():
     pending_list = [e for e in all_evals if e["status"] == "PENDING"]
     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"] == "FINISHED"]
-    df_pending = pd.DataFrame.from_records(pending_list)
-    df_running = pd.DataFrame.from_records(running_list)
-    df_finished = pd.DataFrame.from_records(finished_list)
     return df_finished[EVAL_COLS], df_running[EVAL_COLS], df_pending[EVAL_COLS]
@@ -149,7 +169,7 @@ def is_model_on_hub(model_name, revision) -> bool:
         return False, "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard."
     except Exception as e:
-        print("Could not get the model config from the hub.: \n", e)
         return False, "was not found on hub!"
@@ -200,7 +220,7 @@ def add_new_eval(
     out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{is_8_bit_eval}_{is_delta_weight}.json"
     # Check for duplicate submission
-    if out_path.split("eval_requests/")[1].lower() in requested_models:
         return styled_warning("This model has been already submitted.")
     with open(out_path, "w") as f:
@@ -208,13 +228,17 @@ def add_new_eval(
     api.upload_file(
         path_or_fileobj=out_path,
-        path_in_repo=out_path,
-        repo_id=LMEH_REPO,
         token=H4_TOKEN,
         repo_type="dataset",
     )
-    return styled_message("Your request has been submitted to the evaluation queue!")
 def refresh():
@@ -310,13 +334,6 @@ with demo:
             )
         with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-            with gr.Accordion("📙 Citation", open=False):
-                citation_button = gr.Textbox(
-                    value=CITATION_BUTTON_TEXT,
-                    label=CITATION_BUTTON_LABEL,
-                    elem_id="citation-button",
-                ).style(show_copy_button=True)
     with gr.Column():
         with gr.Row():
@@ -396,6 +413,14 @@ with demo:
                 submission_result,
             )
     dummy = gr.Textbox(visible=False)
     demo.load(
         change_tab,

 from src.auto_leaderboard.load_results import get_eval_results_dicts, make_clickable_model
 from src.assets.hardcoded_evals import gpt4_values, gpt35_values, baseline
 from src.assets.css_html_js import custom_css, get_window_url_params
+from src.utils_display import AutoEvalColumn, EvalQueueColumn, fields, styled_error, styled_warning, styled_message
+from src.init import get_all_requested_models, load_all_info_from_hub
 # clone / pull the lmeh eval data
 H4_TOKEN = os.environ.get("H4_TOKEN", None)
+QUEUE_REPO = "open-llm-leaderboard/requests"
+RESULTS_REPO = "open-llm-leaderboard/results"
+PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
+PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
 IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
 ADD_PLOTS = False
+EVAL_REQUESTS_PATH = "eval-queue"
+EVAL_RESULTS_PATH = "eval-results"
+EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
+EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
+api = HfApi()
 def restart_space():
     api.restart_space(
         repo_id="HuggingFaceH4/open_llm_leaderboard", token=H4_TOKEN
     )
+eval_queue, requested_models, eval_results = load_all_info_from_hub(QUEUE_REPO, RESULTS_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH)
+if not IS_PUBLIC:
+    eval_queue_private, requested_models_private, eval_results_private = load_all_info_from_hub(PRIVATE_QUEUE_REPO, PRIVATE_RESULTS_REPO, EVAL_REQUESTS_PATH_PRIVATE, EVAL_RESULTS_PATH_PRIVATE)
+else:
+    eval_queue_private, eval_results_private = None, None
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
 def get_leaderboard_df():
+    if eval_results:
         print("Pulling evaluation results for the leaderboard.")
+        eval_results.git_pull()
+    if eval_results_private:
+        print("Pulling evaluation results for the leaderboard.")
+        eval_results_private.git_pull()
     all_data = get_eval_results_dicts(IS_PUBLIC)
 def get_evaluation_queue_df():
     # todo @saylortwift: replace the repo by the one you created for the eval queue
+    if eval_queue:
+        print("Pulling changes for the evaluation queue.")
+        eval_queue.git_pull()
+    if eval_queue_private:
         print("Pulling changes for the evaluation queue.")
+        eval_queue_private.git_pull()
     entries = [
         entry
             data["revision"] = data.get("revision", "main")
             all_evals.append(data)
+        elif ".md" not in entry:
             # this is a folder
             sub_entries = [
                 e
     pending_list = [e for e in all_evals if e["status"] == "PENDING"]
     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
+    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED")]
+    df_pending = pd.DataFrame.from_records(pending_list, columns=EVAL_COLS)
+    df_running = pd.DataFrame.from_records(running_list, columns=EVAL_COLS)
+    df_finished = pd.DataFrame.from_records(finished_list, columns=EVAL_COLS)
     return df_finished[EVAL_COLS], df_running[EVAL_COLS], df_pending[EVAL_COLS]
         return False, "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard."
     except Exception as e:
+        print(f"Could not get the model config from the hub.: {e}")
         return False, "was not found on hub!"
     out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{is_8_bit_eval}_{is_delta_weight}.json"
     # Check for duplicate submission
+    if out_path.split("eval-queue/")[1].lower() in requested_models:
         return styled_warning("This model has been already submitted.")
     with open(out_path, "w") as f:
     api.upload_file(
         path_or_fileobj=out_path,
+        path_in_repo=out_path.split("eval-queue/")[1],
+        repo_id=QUEUE_REPO,
         token=H4_TOKEN,
         repo_type="dataset",
+        commit_message=f"Add {model} to eval queue",
     )
+    # remove the local file
+    os.remove(out_path)
+    return styled_message("Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list.")
 def refresh():
             )
         with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
     with gr.Column():
         with gr.Row():
                 submission_result,
             )
+    with gr.Row():
+        with gr.Accordion("📙 Citation", open=False):
+            citation_button = gr.Textbox(
+                value=CITATION_BUTTON_TEXT,
+                label=CITATION_BUTTON_LABEL,
+                elem_id="citation-button",
+            ).style(show_copy_button=True)
     dummy = gr.Textbox(visible=False)
     demo.load(
         change_tab,

src/assets/text_content.py CHANGED Viewed

@@ -61,7 +61,7 @@ INTRODUCTION_TEXT = f"""
 🤗 Anyone from the community can submit a model for automated evaluation on the 🤗 GPU cluster, as long as it is a 🤗 Transformers model with weights on the Hub. We also support evaluation of models with delta-weights for non-commercial licensed models, such as LLaMa.
-Other cool benchmarks for LLMs are developped at HuggingFace, go check them out: 🙋🤖 [human and GPT4 evals](https://huggingface.co/spaces/HuggingFaceH4/human_eval_llm_leaderboard), 🖥️ [performance bencmarks](https://huggingface.co/spaces/optimum/llm-perf-leaderboard)
 """
 LLM_BENCHMARKS_TEXT = f"""
@@ -78,6 +78,29 @@ With the plethora of large language models (LLMs) and chatbots being released we
 We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
 # Reproduction
 To reproduce our results, here are the commands you can run, using [this version](https://github.com/EleutherAI/lm-evaluation-harness/tree/e47e01beea79cfe87421e2dac49e64d499c240b4) of the Eleuther AI Harness:
 `python main.py --model=hf-causal --model_args="pretrained=<your_model>,use_accelerate=True,revision=<your_model_revision>"`
@@ -87,10 +110,17 @@ The total batch size we get for models which fit on one A100 node is 16 (8 GPUs
 *You can expect results to vary slightly for different batch sizes because of padding.*
 The tasks and few shots parameters are:
-- ARC: 25-shot, *arc-challenge*
-- HellaSwag: 10-shot, *hellaswag*
-- TruthfulQA: 0-shot, *truthfulqa-mc* (mc2 score)
-- MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions*
 """
 EVALUATION_QUEUE_TEXT = f"""

 🤗 Anyone from the community can submit a model for automated evaluation on the 🤗 GPU cluster, as long as it is a 🤗 Transformers model with weights on the Hub. We also support evaluation of models with delta-weights for non-commercial licensed models, such as LLaMa.
+Other cool benchmarks for LLMs are developed at HuggingFace, go check them out: 🙋🤖 [human and GPT4 evals](https://huggingface.co/spaces/HuggingFaceH4/human_eval_llm_leaderboard), 🖥️ [performance benchmarks](https://huggingface.co/spaces/optimum/llm-perf-leaderboard)
 """
 LLM_BENCHMARKS_TEXT = f"""
 We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings.
+# Some good practices before submitting a model
+## 1) Make sure you can load your model and tokenizer using AutoClasses:
+```python
+from transformers import AutoConfig, AutoModel, AutoTokenizer
+config = AutoConfig.from_pretrained("your model name", revision=revision)
+model = AutoModel.from_pretrained("your model name", revision=revision)
+tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
+```
+If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
+Note: make sure your model is public!
+Note: if your model needs `trust_remote_code=True`, we do not support this option yet, but we are working on adding it, stay tuned!
+## 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
+It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of weights of your model to the `Extended Viewer`!
+## 3) Make sure your model has an open license!
+This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
+## 4) Fill in your model card
+When we add extra information about models to the leaderboard, it will be automatically taken from the model card.
 # Reproduction
 To reproduce our results, here are the commands you can run, using [this version](https://github.com/EleutherAI/lm-evaluation-harness/tree/e47e01beea79cfe87421e2dac49e64d499c240b4) of the Eleuther AI Harness:
 `python main.py --model=hf-causal --model_args="pretrained=<your_model>,use_accelerate=True,revision=<your_model_revision>"`
 *You can expect results to vary slightly for different batch sizes because of padding.*
 The tasks and few shots parameters are:
+- ARC: 25-shot, *arc-challenge* (`acc_norm`)
+- HellaSwag: 10-shot, *hellaswag* (`acc_norm`)
+- TruthfulQA: 0-shot, *truthfulqa-mc* (`mc2`)
+- MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions* (`acc` of `all`)
+# In case of model failure
+If your model is displayed in the `FAILED` category, its execution stopped.
+Make sure you have followed the above steps first.
+If everything is done, check that you can launch the Eleuther AI Harness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
 """
 EVALUATION_QUEUE_TEXT = f"""

src/auto_leaderboard/load_results.py CHANGED Viewed

@@ -7,14 +7,13 @@ from typing import Dict, List, Tuple
 from src.utils_display import AutoEvalColumn, make_clickable_model
 import numpy as np
-# clone / pull the lmeh eval data
-METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
-BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"]
 BENCH_TO_NAME = {
-    "arc_challenge": AutoEvalColumn.arc.name,
     "hellaswag": AutoEvalColumn.hellaswag.name,
-    "hendrycks": AutoEvalColumn.mmlu.name,
-    "truthfulqa_mc": AutoEvalColumn.truthfulqa.name,
 }
@@ -24,8 +23,8 @@ class EvalResult:
     org: str
     model: str
     revision: str
-    is_8bit: bool
     results: dict
     def to_dict(self):
         if self.org is not None:
@@ -44,7 +43,7 @@ class EvalResult:
         )
         for benchmark in BENCHMARKS:
-            if not benchmark in self.results.keys():
                 self.results[benchmark] = None
         for k, v in BENCH_TO_NAME.items():
@@ -53,57 +52,61 @@ class EvalResult:
         return data_dict
-def parse_eval_result(json_filepath: str) -> Tuple[str, dict]:
     with open(json_filepath) as fp:
         data = json.load(fp)
-    path_split = json_filepath.split("/")
-    org = None
-    model = path_split[-4]
-    is_8bit = path_split[-2] == "8bit"
-    revision = path_split[-3]
-    if len(path_split) == 7:
-        # handles gpt2 type models that don't have an org
-        result_key = f"{model}_{revision}_{is_8bit}"
     else:
-        org = path_split[-5]
-        result_key =  f"{org}_{model}_{revision}_{is_8bit}"
-    eval_result = None
     for benchmark, metric in zip(BENCHMARKS, METRICS):
-        if benchmark in json_filepath:
-            accs = np.array([v[metric] for v in data["results"].values()])
-            mean_acc = round(np.mean(accs) * 100.0, 1)
-            eval_result = EvalResult(
-                result_key, org, model, revision, is_8bit, {benchmark: mean_acc}
-            )
-    return result_key, eval_result
 def get_eval_results(is_public) -> List[EvalResult]:
     json_filepaths = glob.glob(
-        "auto_evals/eval_results/public/**/16bit/*.json", recursive=True
     )
     if not is_public:
         json_filepaths += glob.glob(
-            "auto_evals/eval_results/private/**/*.json", recursive=True
-        )
-        json_filepaths += glob.glob(
-            "auto_evals/eval_results/private/**/*.json", recursive=True
         )
-        # include the 8bit evals of public models
-        json_filepaths += glob.glob(
-            "auto_evals/eval_results/public/**/8bit/*.json", recursive=True
-        )
     eval_results = {}
     for json_filepath in json_filepaths:
-        result_key, eval_result = parse_eval_result(json_filepath)
-        if result_key in eval_results.keys():
-            eval_results[result_key].results.update(eval_result.results)
-        else:
-            eval_results[result_key] = eval_result
     eval_results = [v for v in eval_results.values()]

 from src.utils_display import AutoEvalColumn, make_clickable_model
 import numpy as np
+METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]
+BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc"]
 BENCH_TO_NAME = {
+    "arc:challenge": AutoEvalColumn.arc.name,
     "hellaswag": AutoEvalColumn.hellaswag.name,
+    "hendrycksTest": AutoEvalColumn.mmlu.name,
+    "truthfulqa:mc": AutoEvalColumn.truthfulqa.name,
 }
     org: str
     model: str
     revision: str
     results: dict
+    is_8bit: bool = False
     def to_dict(self):
         if self.org is not None:
         )
         for benchmark in BENCHMARKS:
+            if benchmark not in self.results.keys():
                 self.results[benchmark] = None
         for k, v in BENCH_TO_NAME.items():
         return data_dict
+def parse_eval_result(json_filepath: str) -> Tuple[str, list[dict]]:
     with open(json_filepath) as fp:
         data = json.load(fp)
+    config = data["config"]
+    model = config.get("model_name", None)
+    if model is None:
+        model = config.get("model_args", None)
+    model_sha = config.get("model_sha", "")
+    eval_sha = config.get("lighteval_sha", "")
+    model_split = model.split("/", 1)
+    model = model_split[-1]
+    if len(model_split) == 1:
+        org = None
+        model = model_split[0]
+        result_key = f"{model}_{model_sha}_{eval_sha}"
     else:
+        org = model_split[0]
+        model = model_split[1]
+        result_key =  f"{org}_{model}_{model_sha}_{eval_sha}"
+    eval_results = []
     for benchmark, metric in zip(BENCHMARKS, METRICS):
+        accs = np.array([v[metric] for k, v in data["results"].items() if benchmark in k])
+        if accs.size == 0:
+            continue
+        mean_acc = round(np.mean(accs) * 100.0, 1)
+        eval_results.append(EvalResult(
+            result_key, org, model, model_sha, {benchmark: mean_acc}
+        ))
+    return result_key, eval_results
 def get_eval_results(is_public) -> List[EvalResult]:
     json_filepaths = glob.glob(
+        "eval-results/**/results*.json", recursive=True
     )
     if not is_public:
         json_filepaths += glob.glob(
+            "private-eval-results/**/results*.json", recursive=True
         )
     eval_results = {}
     for json_filepath in json_filepaths:
+        result_key, results = parse_eval_result(json_filepath)
+        for eval_result in results:
+            if result_key in eval_results.keys():
+                eval_results[result_key].results.update(eval_result.results)
+            else:
+                eval_results[result_key] = eval_result
     eval_results = [v for v in eval_results.values()]

src/init.py CHANGED Viewed

@@ -13,26 +13,37 @@ def get_all_requested_models(requested_models_dir):
         if current_depth == depth:
             file_names.extend([os.path.join(root, file) for file in files])
-    return set([file_name.lower().split("eval_requests/")[1] for file_name in file_names])
-def load_all_info_from_hub(LMEH_REPO):
-    auto_eval_repo = None
     requested_models = None
     if H4_TOKEN:
         print("Pulling evaluation requests and results.")
-        auto_eval_repo = Repository(
-            local_dir="./auto_evals/",
-            clone_from=LMEH_REPO,
             use_auth_token=H4_TOKEN,
             repo_type="dataset",
         )
-        auto_eval_repo.git_pull()
-        requested_models_dir = "./auto_evals/eval_requests"
-        requested_models = get_all_requested_models(requested_models_dir)
-    return auto_eval_repo, requested_models
 #def load_results(model, benchmark, metric):

         if current_depth == depth:
             file_names.extend([os.path.join(root, file) for file in files])
+    return set([file_name.lower().split("eval-queue/")[1] for file_name in file_names])
+def load_all_info_from_hub(QUEUE_REPO, RESULTS_REPO, QUEUE_PATH, RESULTS_PATH):
+    eval_queue_repo = None
+    eval_results_repo = None
     requested_models = None
     if H4_TOKEN:
         print("Pulling evaluation requests and results.")
+        eval_queue_repo = Repository(
+            local_dir=QUEUE_PATH,
+            clone_from=QUEUE_REPO,
+            use_auth_token=H4_TOKEN,
+            repo_type="dataset",
+        )
+        eval_queue_repo.git_pull()
+        eval_results_repo = Repository(
+            local_dir=RESULTS_PATH,
+            clone_from=RESULTS_REPO,
             use_auth_token=H4_TOKEN,
             repo_type="dataset",
         )
+        eval_results_repo.git_pull()
+        requested_models = get_all_requested_models("eval-queue")
+    else:
+        print("No HuggingFace token provided. Skipping evaluation requests and results.")
+    return eval_queue_repo, requested_models, eval_results_repo
 #def load_results(model, benchmark, metric):

src/utils_display.py CHANGED Viewed

@@ -15,17 +15,17 @@ def fields(raw_class):
 @dataclass(frozen=True)
 class AutoEvalColumn: # Auto evals column
     model = ColumnContent("Model", "markdown", True)
-    revision = ColumnContent("Revision", "str", True, True)
     model_type = ColumnContent("Type", "bool", False)
     is_8bit = ColumnContent("8bit", "bool", False, True)
     license = ColumnContent("Hub License", "str", False)
     params = ColumnContent("#Params (B)", "number", False)
     likes = ColumnContent("Hub ❤️", "number", False)
-    average = ColumnContent("Average ⬆️", "number", True)
-    arc = ColumnContent("ARC (25-s) ⬆️", "number", True)
-    hellaswag = ColumnContent("HellaSwag (10-s) ⬆️", "number", True)
-    mmlu = ColumnContent("MMLU (5-s) ⬆️", "number", True)
-    truthfulqa = ColumnContent("TruthfulQA (MC) (0-s) ⬆️", "number", True)
     dummy = ColumnContent("model_name_for_query", "str", True) # dummy col to implement search bar (hidden by custom CSS)
 @dataclass(frozen=True)

 @dataclass(frozen=True)
 class AutoEvalColumn: # Auto evals column
     model = ColumnContent("Model", "markdown", True)
+    average = ColumnContent("Average ⬆️", "number", True)
+    arc = ColumnContent("ARC ⬆️", "number", True)
+    hellaswag = ColumnContent("HellaSwag ⬆️", "number", True)
+    mmlu = ColumnContent("MMLU ⬆️", "number", True)
+    truthfulqa = ColumnContent("TruthfulQA (MC) ⬆️", "number", True)
     model_type = ColumnContent("Type", "bool", False)
     is_8bit = ColumnContent("8bit", "bool", False, True)
     license = ColumnContent("Hub License", "str", False)
     params = ColumnContent("#Params (B)", "number", False)
     likes = ColumnContent("Hub ❤️", "number", False)
+    revision = ColumnContent("Model sha", "str", False, False)
     dummy = ColumnContent("model_name_for_query", "str", True) # dummy col to implement search bar (hidden by custom CSS)
 @dataclass(frozen=True)