add data

Files changed:
- README.md +6 -2
- app.py +46 -38
- main_backend.py +78 -0
- requirements.txt +4 -1
- scripts/create_request_file.py +3 -5
- scripts/fix_harness_import.py +11 -0
- src/{display/about.py → about.py} +13 -23
- src/backend/manage_requests.py +122 -0
- src/backend/run_eval_suite.py +57 -0
- src/backend/sort_queue.py +28 -0
- src/display/formatting.py +0 -9
- src/display/utils.py +13 -10
- src/envs.py +11 -3
- src/leaderboard/read_evals.py +1 -3
- src/submission/check_validity.py +8 -21
- src/submission/submit.py +2 -1
README.md
CHANGED

@@ -12,7 +12,7 @@ license: apache-2.0
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
-Most of the variables to change for a default leaderboard are in env (replace the path for your leaderboard) and src/
+Most of the variables to change for a default leaderboard are in src/env (replace the path for your leaderboard) and src/about.
 
 Results files should have the following format:
 ```

@@ -33,4 +33,8 @@ Results files should have the following format:
 }
 ```
 
-Request files are created automatically by this tool.
+Request files are created automatically by this tool.
+
+If you encounter problems on the space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
+
+If you want to run your own backend, you only need to change the logic in src/backend/run_eval_suite, which at the moment launches the Eleuther AI Harness.
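The backend swap mentioned in the README only needs to preserve the `run_evaluation` interface that `main_backend.py` calls (defined in `src/backend/run_eval_suite.py` later in this commit). A minimal sketch of such a replacement is shown below; the placeholder scores and results layout are assumptions for illustration, not the leaderboard's actual results schema, and the hub upload done by the real function is omitted.

```python
# Hypothetical drop-in for src/backend/run_eval_suite.run_evaluation.
# Same signature as the harness-based version added in this commit; the
# "results" payload and scores below are placeholders, not the real schema.
import json
import os
from datetime import datetime


def run_evaluation(eval_request, task_names, num_fewshot, batch_size, device,
                   local_dir: str, results_repo: str, no_cache=True, limit=None):
    results = {
        "config": {
            "model_name": eval_request.model,
            "model_sha": eval_request.revision,
            "model_dtype": eval_request.precision,
        },
        # Replace this with your own evaluation loop.
        "results": {task: {"acc,none": 0.0} for task in task_names},
    }

    output_path = os.path.join(local_dir, *eval_request.model.split("/"),
                               f"results_{datetime.now()}.json")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w") as f:
        json.dump(results, f, indent=2)
    return results
```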
app.py
CHANGED

@@ -1,12 +1,39 @@
+import subprocess
 import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 
-
+from src.about import (
+    CITATION_BUTTON_LABEL,
+    CITATION_BUTTON_TEXT,
+    EVALUATION_QUEUE_TEXT,
+    INTRODUCTION_TEXT,
+    LLM_BENCHMARKS_TEXT,
+    TITLE,
+)
+from src.display.css_html_js import custom_css
+from src.display.utils import (
+    BENCHMARK_COLS,
+    COLS,
+    EVAL_COLS,
+    EVAL_TYPES,
+    NUMERIC_INTERVALS,
+    TYPES,
+    AutoEvalColumn,
+    ModelType,
+    fields,
+    WeightType,
+    Precision
+)
+from src.envs import API, DEVICE, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.submission.submit import add_new_eval
+
+import os
 
 # # Define the folders to delete
-# folders_to_delete = ['eval-results', 'eval-queue']
+# folders_to_delete = ['eval-results', 'eval-queue', 'eval-queue-bk', 'eval-results-bk']
 
 # import shutil
 

@@ -29,61 +56,41 @@ from huggingface_hub import snapshot_download
 # # Find and kill processes running on port 7878
 # try:
 # # Find process using port 7878
-# output = subprocess.check_output(["lsof", "-ti", "tcp:
+# output = subprocess.check_output(["lsof", "-ti", "tcp:7862"]).decode().strip()
 # if output:
 # # Split the output in case there are multiple PIDs
 # pids = output.split('\n')
 # for pid in pids:
 # # Kill each process
 # os.kill(int(pid), signal.SIGKILL)
-# result = "Processes running on port
+# result = "Processes running on port 7862 have been killed."
 # else:
-# result = "No processes are running on port
+# result = "No processes are running on port 7862."
 # except Exception as e:
 # result = f"An error occurred: {str(e)}"
 
 
-
-
-    CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
-    INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
-    TITLE,
-)
-from src.display.css_html_js import custom_css
-from src.display.utils import (
-    BENCHMARK_COLS,
-    COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    NUMERIC_INTERVALS,
-    TYPES,
-    AutoEvalColumn,
-    ModelType,
-    fields,
-    WeightType,
-    Precision
-)
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, TOKEN, QUEUE_REPO, REPO_ID, RESULTS_REPO
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
+subprocess.run(["python3", "scripts/fix_harness_import.py"])
+
 
 
 def restart_space():
-    API.restart_space(repo_id=REPO_ID
+    API.restart_space(repo_id=REPO_ID)
+
+def launch_backend():
+    _ = subprocess.run(["python3", "main_backend.py"])
 
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
    )
 except Exception:
    restart_space()
 try:
    print(EVAL_RESULTS_PATH)
    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
    )
 except Exception:
    restart_space()

@@ -154,12 +161,12 @@ def filter_models(
     df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
 ) -> pd.DataFrame:
     # Show all models
-    filtered_df = df
     # if show_deleted:
     # filtered_df = df
     # else: # Show only still on the hub models
     # filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
 
+    filtered_df = df
     type_emoji = [t[0] for t in type_query]
     filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
     filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]

@@ -344,7 +351,7 @@ with demo:
     choices=[i.value.name for i in Precision if i != Precision.Unknown],
     label="Precision",
     multiselect=False,
-    value="float16",
+    value="float16" if DEVICE != "cpu" else "float32",
     interactive=True,
 )
 weight_type = gr.Dropdown(

@@ -383,11 +390,12 @@ with demo:
 
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
+scheduler.add_job(launch_backend, "interval", seconds=100) # will only allow one job to be run at the same time
 scheduler.start()
 demo.queue(default_concurrency_limit=40).launch()
 
-
 # scheduler = BackgroundScheduler()
-# scheduler.add_job(restart_space, "interval", seconds=
+# scheduler.add_job(restart_space, "interval", seconds=1800)
+# scheduler.add_job(launch_backend, "interval", seconds=100) # will only allow one job to be run at the same time
 # scheduler.start()
-# demo.queue().launch()
+# demo.queue().launch()
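A note on the new `scheduler.add_job(launch_backend, "interval", seconds=100)` line: the "only one job at a time" behaviour the comment relies on comes from APScheduler's per-job `max_instances` setting, which defaults to 1, so a tick that fires while a previous backend run is still in progress is skipped rather than stacked. The equivalent setup with that default made explicit:

```python
# Equivalent scheduling setup with APScheduler's default made explicit.
import subprocess
from apscheduler.schedulers.background import BackgroundScheduler

def launch_backend():
    # as in app.py: run one pass of the backend in a subprocess
    subprocess.run(["python3", "main_backend.py"])

scheduler = BackgroundScheduler()
# max_instances=1 is the default: overlapping runs are skipped, not queued up.
scheduler.add_job(launch_backend, "interval", seconds=100, max_instances=1)
scheduler.start()
```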
main_backend.py
ADDED

@@ -0,0 +1,78 @@
import logging
import pprint

from huggingface_hub import snapshot_download

logging.getLogger("openai").setLevel(logging.WARNING)

from src.backend.run_eval_suite import run_evaluation
from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
from src.backend.sort_queue import sort_models_by_priority

from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, DEVICE, API, LIMIT, TOKEN
from src.about import Tasks, NUM_FEWSHOT
TASKS_HARNESS = [task.value.benchmark for task in Tasks]

logging.basicConfig(level=logging.ERROR)
pp = pprint.PrettyPrinter(width=80)

PENDING_STATUS = "PENDING"
RUNNING_STATUS = "RUNNING"
FINISHED_STATUS = "FINISHED"
FAILED_STATUS = "FAILED"

snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)

def run_auto_eval():
    current_pending_status = [PENDING_STATUS]

    # pull the eval dataset from the hub and parse any eval requests
    # check completed evals and set them to finished
    check_completed_evals(
        api=API,
        checked_status=RUNNING_STATUS,
        completed_status=FINISHED_STATUS,
        failed_status=FAILED_STATUS,
        hf_repo=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
        hf_repo_results=RESULTS_REPO,
        local_dir_results=EVAL_RESULTS_PATH_BACKEND
    )

    # Get all eval request that are PENDING, if you want to run other evals, change this parameter
    eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
    # Sort the evals by priority (first submitted first run)
    eval_requests = sort_models_by_priority(api=API, models=eval_requests)

    print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")

    if len(eval_requests) == 0:
        return

    eval_request = eval_requests[0]
    pp.pprint(eval_request)

    set_eval_request(
        api=API,
        eval_request=eval_request,
        set_to_status=RUNNING_STATUS,
        hf_repo=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
    )

    run_evaluation(
        eval_request=eval_request,
        task_names=TASKS_HARNESS,
        num_fewshot=NUM_FEWSHOT,
        local_dir=EVAL_RESULTS_PATH_BACKEND,
        results_repo=RESULTS_REPO,
        batch_size=1,
        device=DEVICE,
        no_cache=True,
        limit=LIMIT
    )


if __name__ == "__main__":
    run_auto_eval()
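`run_auto_eval()` drives a small state machine over the request files in the queue dataset: PENDING requests are picked oldest-first, flipped to RUNNING, and later marked FINISHED or FAILED depending on whether a results file appears. For reference, a request file the backend could pick up might look like the following; the field names come from the `EvalRequest` dataclass added in `src/backend/manage_requests.py` below, but the exact schema written by the submission form is not fully shown in this commit, so treat it as an inferred example (the model id and file path are hypothetical):

```python
# Inferred example of a PENDING request file; any *.json under the queue
# folder is found by get_eval_requests' recursive glob. Field names match the
# EvalRequest dataclass; values and the path are made up for illustration.
import json
import os

request = {
    "model": "my-org/my-model",
    "private": False,
    "status": "PENDING",
    "weight_type": "Original",
    "precision": "float16",
    "revision": "main",
    "submitted_time": "2024-01-01T00:00:00Z",
    "likes": 0,
    "params": 7,
    "license": "apache-2.0",
}

os.makedirs("eval-queue-bk/my-org", exist_ok=True)
with open("eval-queue-bk/my-org/my-model.json", "w") as f:
    json.dump(request, f, indent=2)
```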
requirements.txt
CHANGED

@@ -12,4 +12,7 @@ python-dateutil==2.8.2
 requests==2.28.2
 tqdm==4.65.0
 transformers
-tokenizers>=0.15.0
+tokenizers>=0.15.0
+git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
+accelerate==0.24.1
+sentencepiece
scripts/create_request_file.py
CHANGED

@@ -7,11 +7,9 @@ from datetime import datetime, timezone
 import click
 from colorama import Fore
 from huggingface_hub import HfApi, snapshot_download
+from src.envs import TOKEN, EVAL_REQUESTS_PATH, QUEUE_REPO
 
-
-QUEUE_REPO = "open-llm-leaderboard/requests"
-
-precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
+precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ", "float32")
 model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
 weight_types = ("Original", "Delta", "Adapter")
 

@@ -36,7 +34,7 @@ def get_model_size(model_info, precision: str):
 def main():
     api = HfApi()
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-    snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset")
+    snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN)
 
     model_name = click.prompt("Enter model name")
     revision = click.prompt("Enter revision", default="main")
scripts/fix_harness_import.py
ADDED

@@ -0,0 +1,11 @@
"""This file should be used after pip install -r requirements.
It creates a folder not ported during harness package creation (as they don't use a Manifest file atm and it ignore `.json` files).
It will need to be updated if we want to use the harness' version of big bench to actually copy the json files.
"""
import os

import lm_eval

if __name__ == "__main__":
    lm_eval_path = lm_eval.__path__[0]
    os.makedirs(os.path.join(lm_eval_path, "datasets", "bigbench_resources"), exist_ok=True)
src/{display/about.py → about.py}
RENAMED

@@ -1,7 +1,5 @@
 from dataclasses import dataclass
 from enum import Enum
-# from src.display.utils import ModelType
-
 
 @dataclass
 class Task:

@@ -10,7 +8,8 @@ class Task:
     col_name: str
 
 
-#
+# Select your tasks here
+# ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("medmcqa", "acc,none", "MedMCQA")

@@ -23,18 +22,20 @@ class Tasks(Enum):
     task7 = Task("mmlu_professional_medicine", "acc,none", "MMLU Professional Medicine")
     task8 = Task("pubmedqa", "acc,none", "PubMedQA")
 
+
 
-# "medmcqa", "acc,none", "MedMCQA"
 
-#
-#
+NUM_FEWSHOT = 0 # Change with your few shot
+# ---------------------------------------------------
+
+
+
 TITLE = """
 
 
 <div style="text-align: center; margin-bottom: 20px;">
 <img src="https://raw.githubusercontent.com/monk1337/MultiMedQA/main/assets/logs.png" alt="Descriptive Alt Text" style="display: block; margin: auto; height: 160px;">
 </div>
-
 <h1 align="center" style="color: #1a237e; font-size: 40px;">Open <span style="color: #990001;">Medical-LLM</span> Leaderboard</h1>
 
 

@@ -44,15 +45,17 @@ TITLE = """
 INTRODUCTION_TEXT = """
 🩺 The Open Medical LLM Leaderboard aims to track, rank and evaluate the performance of large language models (LLMs) on medical question answering tasks. It evaluates LLMs across a diverse array of medical datasets, including MedQA (USMLE), PubMedQA, MedMCQA, and subsets of MMLU related to medicine and biology. The leaderboard offers a comprehensive assessment of each model's medical knowledge and question answering capabilities.
 
+
 The datasets cover various aspects of medicine such as general medical knowledge, clinical knowledge, anatomy, genetics, and more. They contain multiple-choice and open-ended questions that require medical reasoning and understanding. More details on the datasets can be found in the "LLM Benchmarks Details" section below.
 
+
 The main evaluation metric used is Accuracy (ACC). Submit a model for automated evaluation on the "Submit" page. If you have comments or suggestions on additional medical datasets to include, please reach out to us in our discussion forum.
 
+
 The backend of the Open Medical LLM Leaderboard uses the Eleuther AI Language Model Evaluation Harness. More technical details can be found in the "About" page.
 """
 
 LLM_BENCHMARKS_TEXT = f"""
-
 <h2 style="color: #2c3e50;"> Why Leaderboard? </h2>
 
 Evaluating the medical knowledge and clinical reasoning capabilities of LLMs is crucial as they are increasingly being applied to healthcare and biomedical applications. The Open Medical LLM Leaderboard provides a platform to assess the latest LLMs on their performance on a variety of medical question answering tasks. This can help identify the strengths and gaps in medical understanding of current models.

@@ -64,18 +67,15 @@ Evaluating the medical knowledge and clinical reasoning capabilities of LLMs is
 <h2 style="color: #2c3e50;">About Open Life Science AI</h2>
 An Open Life Science Project to Benchmark and Track AI Progress, Share Models and Datasets in the Life Science Field.
 <a href="https://openlifescience.ai/" target="_blank"> More info </a>
-
-
 <h2 style="color: #2c3e50;">Datasets</h2>
 
 <div style="font-family: Arial, sans-serif; line-height: 1.6; color: #333;"> <ul style="list-style-type: none; padding: 0;"> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.13081" target="_blank" style="color: #3498db;">MedQA (USMLE)</a></h3> <p>1273 real-world questions from the US Medical License Exams (USMLE) to test general medical knowledge</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/1909.06146" target="_blank" style="color: #3498db;">PubMedQA</a></h3> <p>500 questions constructed from PubMed article titles along with the abstracts as context to test understanding of biomedical research</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://proceedings.mlr.press/v174/pal22a.html" target="_blank" style="color: #3498db;">MedMCQA</a></h3> <p>4183 questions from Indian medical entrance exams (AIIMS & NEET PG) spanning 2.4k healthcare topics</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-Clinical knowledge</a></h3> <p>265 multiple choice questions on clinical knowledge</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-Medical genetics</a></h3> <p>100 MCQs on medical genetics</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-Anatomy</a></h3> <p>135 anatomy MCQs</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-Professional medicine</a></h3> <p>272 MCQs on professional medicine</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-College biology</a></h3> <p>144 MCQs on college-level biology</p> </li> <li> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-College medicine</a></h3> <p>173 college medicine MCQs</p> </li> </ul> </div>
 
-<div style="font-family: Arial, sans-serif; line-height: 1.6; color: #333;"> <h2 style="color: #2c3e50;">Evaluation Metric</h2> <p>Metric Accuracy (ACC) is used as the main evaluation metric across all datasets.</p> <h2 style="color: #2c3e50;">Details and Logs</h2> <p>Detailed results are available in the results directory:</p> <a href="https://huggingface.co/datasets/openlifescienceai/results" target="_blank" style="color: #3498db;">https://huggingface.co/datasets/openlifescienceai/results</a> <p>Input/outputs for each model can be found in the details page accessible by clicking the 📄 emoji next to the model name.</p> <h2 style="color: #2c3e50;">Reproducibility</h2> <p>To reproduce the results, you can run this evaluation script:</p> <pre style="background-color: #f0f0f0; padding: 10px; border-radius: 5px;">python eval_medical_llm.py</pre> <p>To evaluate a specific dataset on a model, use the EleutherAI LLM Evaluation Harness:</p> <pre style="background-color: #f0f0f0; padding: 10px; border-radius: 5px;">python main.py --model=hf-auto --model_args="pretrained=<model>,revision=<revision>,parallelize=True" --tasks=<dataset> --num_fewshot=<n_shots> --batch_size=1 --output_path=<output_dir></pre> <p>Note some datasets may require additional setup, refer to the Evaluation Harness documentation.</p> <p>Adjust batch size based on your GPU memory if not using parallelism. Minor variations in results are expected with different batch sizes due to padding.</p> <h2 style="color: #2c3e50;">Icons</h2> <ul style="list-style-type: none; padding: 0;"> <li>🟢 Pre-trained model</li> <li>🔶 Fine-tuned model</li> <li>? Unknown model type</li> <li>⭕ Instruction-tuned</li> <li>🟦 RL-tuned</li> </ul> <p>Missing icons indicate the model info is not yet added, feel free to open an issue to include it!</p> </div>
 
+<div style="font-family: Arial, sans-serif; line-height: 1.6; color: #333;"> <h2 style="color: #2c3e50;">Evaluation Metric</h2> <p>Metric Accuracy (ACC) is used as the main evaluation metric across all datasets.</p> <h2 style="color: #2c3e50;">Details and Logs</h2> <p>Detailed results are available in the results directory:</p> <a href="https://huggingface.co/datasets/openlifescienceai/results" target="_blank" style="color: #3498db;">https://huggingface.co/datasets/openlifescienceai/results</a> <p>Input/outputs for each model can be found in the details page accessible by clicking the 📄 emoji next to the model name.</p> <h2 style="color: #2c3e50;">Reproducibility</h2> <p>To reproduce the results, you can run this evaluation script:</p> <pre style="background-color: #f0f0f0; padding: 10px; border-radius: 5px;">python eval_medical_llm.py</pre> <p>To evaluate a specific dataset on a model, use the EleutherAI LLM Evaluation Harness:</p> <pre style="background-color: #f0f0f0; padding: 10px; border-radius: 5px;">python main.py --model=hf-auto --model_args="pretrained=<model>,revision=<revision>,parallelize=True" --tasks=<dataset> --num_fewshot=<n_shots> --batch_size=1 --output_path=<output_dir></pre> <p>Note some datasets may require additional setup, refer to the Evaluation Harness documentation.</p> <p>Adjust batch size based on your GPU memory if not using parallelism. Minor variations in results are expected with different batch sizes due to padding.</p> <h2 style="color: #2c3e50;">Icons</h2> <ul style="list-style-type: none; padding: 0;"> <li>🟢 Pre-trained model</li> <li>🔶 Fine-tuned model</li> <li>? Unknown model type</li> <li>⭕ Instruction-tuned</li> <li>🟦 RL-tuned</li> </ul> <p>Missing icons indicate the model info is not yet added, feel free to open an issue to include it!</p> </div>
 """
 
 LLM_BENCHMARKS_DETAILS = f"""
-
 Datasets
 <a href="https://arxiv.org/abs/2009.13081" target="_blank">MedQA (USMLE)</a> - 1273 real-world questions from the US Medical License Exams (USMLE) to test general medical knowledge
 <a href="https://arxiv.org/abs/1909.06146" target="_blank">PubMedQA</a> - 500 questions constructed from PubMed article titles along with the abstracts as context to test understanding of biomedical research

@@ -93,14 +93,10 @@ Detailed results are available in the results directory: https://huggingface.co/
 Input/outputs for each model can be found in the details page accessible by clicking the 📄 emoji next to the model name
 Reproducibility
 To reproduce the results, you can run this evaluation script: python eval_medical_llm.py.
-
 To evaluate a specific dataset on a model, use the EleutherAI LLM Evaluation Harness:
-
 python main.py --model=hf-auto --model_args="pretrained=<model>,revision=<revision>,parallelize=True"
 --tasks=<dataset> --num_fewshot=<n_shots> --batch_size=1 --output_path=<output_dir>
-
 Note some datasets may require additional setup, refer to the Evaluation Harness documentation. Adjust batch size based on your GPU memory if not using parallelism. Minor variations in results are expected with different batch sizes due to padding.
-
 Icons
 🟢 Pre-trained model
 🔶 Fine-tuned model

@@ -114,28 +110,24 @@ FAQ_TEXT = """
 FAQ
 1) Submitting a model
 XXX
-
 2) Model results
 XXX
-
 3) Editing a submission
 XXX
 """
 
 EVALUATION_QUEUE_TEXT = """
-
 Evaluation Queue for the Open Medical LLM Leaderboard
 Models added here will be automatically evaluated.
 
 Before submitting a model
 1) Verify loading with AutoClasses:
-python
 
 
-Copy code
 from transformers import AutoConfig, AutoModel, AutoTokenizer
 config = AutoConfig.from_pretrained("model-name", revision=revision)
 model = AutoModel.from_pretrained("model-name", revision=revision)
+
 tokenizer = AutoTokenizer.from_pretrained("model-name", revision=revision)
 Debug any loading errors before submission. Make sure the model is public.
 

@@ -162,7 +154,6 @@ year = {2024},
 publisher = {Hugging Face},
 howpublished = "\url{https://huggingface.co/spaces/openlifescienceai/open_medical_llm_leaderboard}"
 }
-
 @misc{singhal2022large,
 title={Large Language Models Encode Clinical Knowledge},
 author={Karan Singhal et al.},

@@ -171,5 +162,4 @@ howpublished = "\url{https://huggingface.co/spaces/openlifescienceai/open_medica
 archivePrefix={arXiv},
 primaryClass={cs.CL}
 }
-
 """
src/backend/manage_requests.py
ADDED

@@ -0,0 +1,122 @@
import glob
import json
from dataclasses import dataclass
from typing import Optional

from huggingface_hub import HfApi, snapshot_download
from src.envs import TOKEN

@dataclass
class EvalRequest:
    model: str
    private: bool
    status: str
    json_filepath: str
    weight_type: str = "Original"
    model_type: str = ""  # pretrained, finetuned, with RL
    precision: str = ""  # float16, bfloat16
    base_model: Optional[str] = None  # for adapter models
    revision: str = "main"  # commit
    submitted_time: Optional[str] = "2022-05-18T11:40:22.519222"  # random date just so that we can still order requests by date
    model_type: Optional[str] = None
    likes: Optional[int] = 0
    params: Optional[int] = None
    license: Optional[str] = ""

    def get_model_args(self):
        model_args = f"pretrained={self.model},revision={self.revision}"

        if self.precision in ["float16", "bfloat16", "float32"]:
            model_args += f",dtype={self.precision}"
        # Quantized models need some added config, the install of bits and bytes, etc
        #elif self.precision == "8bit":
        #    model_args += ",load_in_8bit=True"
        #elif self.precision == "4bit":
        #    model_args += ",load_in_4bit=True"
        #elif self.precision == "GPTQ":
            # A GPTQ model does not need dtype to be specified,
            # it will be inferred from the config
            pass
        else:
            raise Exception(f"Unknown precision {self.precision}.")

        return model_args


def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str, hf_repo: str, local_dir: str):
    """Updates a given eval request with its new status on the hub (running, completed, failed, ...)"""
    json_filepath = eval_request.json_filepath

    with open(json_filepath) as fp:
        data = json.load(fp)

    data["status"] = set_to_status

    with open(json_filepath, "w") as f:
        f.write(json.dumps(data))

    api.upload_file(
        path_or_fileobj=json_filepath,
        path_in_repo=json_filepath.replace(local_dir, ""),
        repo_id=hf_repo,
        repo_type="dataset",
    )


def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
    """Get all pending evaluation requests and return a list in which private
    models appearing first, followed by public models sorted by the number of
    likes.

    Returns:
        `list[EvalRequest]`: a list of model info dicts.
    """
    snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60, token=TOKEN)
    json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)

    eval_requests = []
    for json_filepath in json_files:
        with open(json_filepath) as fp:
            data = json.load(fp)
        if data["status"] in job_status:
            data["json_filepath"] = json_filepath
            eval_request = EvalRequest(**data)
            eval_requests.append(eval_request)

    return eval_requests


def check_completed_evals(
    api: HfApi,
    hf_repo: str,
    local_dir: str,
    checked_status: str,
    completed_status: str,
    failed_status: str,
    hf_repo_results: str,
    local_dir_results: str,
):
    """Checks if the currently running evals are completed, if yes, update their status on the hub."""
    snapshot_download(repo_id=hf_repo_results, revision="main", local_dir=local_dir_results, repo_type="dataset", max_workers=60, token=TOKEN)

    running_evals = get_eval_requests(checked_status, hf_repo=hf_repo, local_dir=local_dir)

    for eval_request in running_evals:
        model = eval_request.model
        print("====================================")
        print(f"Checking {model}")

        output_path = model
        output_file = f"{local_dir_results}/{output_path}/results*.json"
        output_file_exists = len(glob.glob(output_file)) > 0

        if output_file_exists:
            print(
                f"EXISTS output file exists for {model} setting it to {completed_status}"
            )
            set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
        else:
            print(
                f"No result file found for {model} setting it to {failed_status}"
            )
            set_eval_request(api, eval_request, failed_status, hf_repo, local_dir)
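`get_model_args()` is what ties a request to the harness: it turns the request fields into the `model_args` string passed to `lm_eval`. A quick check with a hypothetical model id:

```python
# Builds the lm-eval model_args string from a request (model id is hypothetical).
from src.backend.manage_requests import EvalRequest

req = EvalRequest(
    model="my-org/my-model",
    private=False,
    status="PENDING",
    json_filepath="",   # not used by get_model_args
    precision="float16",
)
print(req.get_model_args())
# pretrained=my-org/my-model,revision=main,dtype=float16
```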
src/backend/run_eval_suite.py
ADDED

@@ -0,0 +1,57 @@
import json
import os
import logging
from datetime import datetime

from lm_eval import tasks, evaluator, utils

from src.envs import RESULTS_REPO, API
from src.backend.manage_requests import EvalRequest

logging.getLogger("openai").setLevel(logging.WARNING)

def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, local_dir: str, results_repo: str, no_cache=True, limit=None):
    if limit:
        print(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    task_names = utils.pattern_match(task_names, tasks.ALL_TASKS)

    print(f"Selected Tasks: {task_names}")

    results = evaluator.simple_evaluate(
        model="hf-causal-experimental",  # "hf-causal"
        model_args=eval_request.get_model_args(),
        tasks=task_names,
        num_fewshot=num_fewshot,
        batch_size=batch_size,
        device=device,
        no_cache=no_cache,
        limit=limit,
        write_out=True,
        output_base_path="logs"
    )

    results["config"]["model_dtype"] = eval_request.precision
    results["config"]["model_name"] = eval_request.model
    results["config"]["model_sha"] = eval_request.revision

    dumped = json.dumps(results, indent=2)
    print(dumped)

    output_path = os.path.join(local_dir, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w") as f:
        f.write(dumped)

    print(evaluator.make_table(results))

    API.upload_file(
        path_or_fileobj=output_path,
        path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
        repo_id=results_repo,
        repo_type="dataset",
    )

    return results
src/backend/sort_queue.py
ADDED

@@ -0,0 +1,28 @@
import re
from dataclasses import dataclass

from huggingface_hub import HfApi

from src.backend.manage_requests import EvalRequest


@dataclass
class ModelMetadata:
    likes: int = 0
    size: int = 15


def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
    private_models = [model for model in models if model.private]
    public_models = [model for model in models if not model.private]

    return sort_by_submit_date(private_models) + sort_by_submit_date(public_models)

def sort_by_submit_date(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
    return sorted(eval_requests, key=lambda x: x.submitted_time, reverse=False)

def sort_by_size(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
    return sorted(eval_requests, key=lambda x: x.params, reverse=False)

def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
    return sorted(eval_requests, key=lambda x: x.likes, reverse=False)
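The ordering used by `main_backend.py` is therefore: private requests first, then public ones, each group oldest submission first. A small illustration with hypothetical requests:

```python
# Private requests come first, then public, each sorted by submitted_time.
from src.backend.manage_requests import EvalRequest
from src.backend.sort_queue import sort_models_by_priority

queue = [
    EvalRequest(model="org/public-new", private=False, status="PENDING", json_filepath="", submitted_time="2024-02-01T00:00:00Z"),
    EvalRequest(model="org/private-old", private=True, status="PENDING", json_filepath="", submitted_time="2024-01-01T00:00:00Z"),
    EvalRequest(model="org/public-old", private=False, status="PENDING", json_filepath="", submitted_time="2024-01-15T00:00:00Z"),
]

ordered = sort_models_by_priority(api=None, models=queue)  # the api argument is not used by the sort itself
print([r.model for r in ordered])
# ['org/private-old', 'org/public-old', 'org/public-new']
```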
src/display/formatting.py
CHANGED

@@ -1,12 +1,3 @@
-import os
-from datetime import datetime, timezone
-
-from huggingface_hub import HfApi
-from huggingface_hub.hf_api import ModelInfo
-
-
-API = HfApi()
-
 def model_hyperlink(link, model_name):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
src/display/utils.py
CHANGED

@@ -3,7 +3,7 @@ from enum import Enum
 
 import pandas as pd
 
-from src.
+from src.about import Tasks
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

@@ -94,9 +94,10 @@ class WeightType(Enum):
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
-
-
-
+    float32 = ModelDetails("float32")
+    #qt_8bit = ModelDetails("8bit")
+    #qt_4bit = ModelDetails("4bit")
+    #qt_GPTQ = ModelDetails("GPTQ")
     Unknown = ModelDetails("?")
 
     def from_str(precision):

@@ -104,12 +105,14 @@ class Precision(Enum):
         return Precision.float16
     if precision in ["torch.bfloat16", "bfloat16"]:
         return Precision.bfloat16
-    if precision in ["
-        return Precision.
-    if precision in ["
-
-    if precision in ["
-
+    if precision in ["float32"]:
+        return Precision.float32
+    #if precision in ["8bit"]:
+    #    return Precision.qt_8bit
+    #if precision in ["4bit"]:
+    #    return Precision.qt_4bit
+    #if precision in ["GPTQ", "None"]:
+    #    return Precision.qt_GPTQ
     return Precision.Unknown
 
 # Column selection
src/envs.py
CHANGED

@@ -2,19 +2,27 @@ import os
 
 from huggingface_hub import HfApi
 
-#
-
+# Info to change for your repository
+# ----------------------------------
+TOKEN = os.environ.get("TOKEN") # A read/write token for your org
+
+DEVICE = "cpu" # "cuda:0" if you add compute
+LIMIT = None # !!!! Should be None for actual evaluations!!!
+# ----------------------------------
 
 OWNER = "openlifescienceai"
 REPO_ID = f"{OWNER}/open_medical_llm_leaderboard"
 QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
+
+# If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
-# print("CACHE_PATH", CACHE_PATH)
 
 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
+EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
 API = HfApi(token=TOKEN)
src/leaderboard/read_evals.py
CHANGED

@@ -103,7 +103,7 @@ class EvalResult:
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
         except Exception:
-            print(f"Could not find request file for {self.org}/{self.model}")
+            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""

@@ -139,8 +139,6 @@ def get_request_file_for_model(requests_path, model_name, precision):
     )
     request_files = glob.glob(request_files)
 
-    print("yahaa", request_files)
-
     # Select correct request file (precision)
     request_file = ""
     request_files = sorted(request_files, reverse=True)
src/submission/check_validity.py
CHANGED

@@ -8,7 +8,7 @@ import huggingface_hub
 from huggingface_hub import ModelCard
 from huggingface_hub.hf_api import ModelInfo
 from transformers import AutoConfig
-from transformers.models.auto.tokenization_auto import
+from transformers.models.auto.tokenization_auto import AutoTokenizer
 
 def check_model_card(repo_id: str) -> tuple[bool, str]:
     """Checks if the model card and license exist and have been filled"""

@@ -31,32 +31,20 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
 
     return True, ""
 
-
 def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
-    """Makes sure the model is on the hub, and uses a valid configuration (in the latest transformers version)"""
     try:
-
-        print("this is input :", model_name, revision, token, trust_remote_code, test_tokenizer)
         config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
-        print("getting config", config)
         if test_tokenizer:
-
-
-
-            tokenizer_class_candidate = tokenizer_config.get("tokenizer_class", None)
-        else:
-            tokenizer_class_candidate = config.tokenizer_class
-
-
-        tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
-
-        if tokenizer_class is None:
+            try:
+                tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+            except ValueError as e:
                 return (
                     False,
-                    f"uses
+                    f"uses a tokenizer which is not in a transformers release: {e}",
                     None
                 )
-
+            except Exception as e:
+                return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
         return True, None, config
 
     except ValueError:

@@ -67,7 +55,6 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
         )
 
     except Exception as e:
-        print("exception is", e)
         return False, "was not found on hub!", None
 
 

@@ -107,4 +94,4 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
         organisation, _ = info["model"].split("/")
         users_to_submission_dates[organisation].append(info["submitted_time"])
 
-    return set(file_names), users_to_submission_dates
+    return set(file_names), users_to_submission_dates
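With the rework above, `is_model_on_hub` now tries to actually load the tokenizer via `AutoTokenizer` when `test_tokenizer=True`, instead of only resolving the tokenizer class name, so broken tokenizer configs are rejected at submission time with a clearer message. A usage sketch (the model id is hypothetical and a TOKEN environment variable is assumed):

```python
# Usage sketch: returns (ok, error_message, config); with test_tokenizer=True
# the tokenizer is loaded too, so misconfigured tokenizers fail here already.
import os
from src.submission.check_validity import is_model_on_hub

ok, error, config = is_model_on_hub(
    model_name="my-org/my-model",
    revision="main",
    token=os.environ.get("TOKEN"),
    test_tokenizer=True,
)
if not ok:
    print(f'Model "my-org/my-model" {error}')
```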
src/submission/submit.py
CHANGED

@@ -50,7 +50,7 @@ def add_new_eval(
         return styled_error(f'Base model "{base_model}" {error}')
 
     if not weight_type == "Adapter":
-        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, test_tokenizer=True)
+        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
         if not model_on_hub:
             return styled_error(f'Model "{model}" {error}')
 

@@ -87,6 +87,7 @@
         "likes": model_info.likes,
         "params": model_size,
         "license": license,
+        "private": False,
     }
 
     # Check for duplicate submission