aaditya committed on
Commit
20d5de3
1 Parent(s): 6f322a8
README.md CHANGED
@@ -12,7 +12,7 @@ license: apache-2.0
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
14
 
15
- Most of the variables to change for a default leaderboard are in env (replace the path for your leaderboard) and src/display/about.
16
 
17
  Results files should have the following format:
18
  ```
@@ -33,4 +33,8 @@ Results files should have the following format:
33
  }
34
  ```
35
 
36
- Request files are created automatically by this tool.
 
 
 
 
 
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
14
 
15
+ Most of the variables to change for a default leaderboard are in src/envs.py (replace the paths for your leaderboard) and src/about.py.
16
 
17
  Results files should have the following format:
18
  ```
 
33
  }
34
  ```
35
 
36
+ Request files are created automatically by this tool.
37
+
38
+ If you encounter a problem on the Space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
39
+
40
+ If you want to run your own backend, you only need to change the logic in src/backend/run_eval_suite, which at the moment launches the Eleuther AI Harness.
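For reference, running your own backend only means re-implementing `run_evaluation` in src/backend/run_eval_suite with the same inputs and output layout. A minimal sketch, assuming you keep the `EvalRequest` dataclass and the results-repo upload used in this commit (the placeholder scores are illustrative, not a real evaluator):

```python
# A minimal sketch of a custom src/backend/run_eval_suite.run_evaluation().
# The placeholder scoring stands in for whatever evaluator you plug in;
# the output layout and upload mirror the version shipped in this commit.
import json
import os
from datetime import datetime

from src.backend.manage_requests import EvalRequest
from src.envs import API


def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size,
                   device, local_dir: str, results_repo: str, no_cache=True, limit=None):
    results = {
        "config": {
            "model_dtype": eval_request.precision,
            "model_name": eval_request.model,
            "model_sha": eval_request.revision,
        },
        # Placeholder scores: replace this comprehension with your own evaluation logic.
        "results": {task: {"acc,none": 0.0} for task in task_names},
    }

    # Keep the same file layout so src/leaderboard/read_evals.py can pick the results up.
    output_path = os.path.join(local_dir, *eval_request.model.split("/"),
                               f"results_{datetime.now()}.json")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w") as f:
        f.write(json.dumps(results, indent=2))

    API.upload_file(
        path_or_fileobj=output_path,
        path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
        repo_id=results_repo,
        repo_type="dataset",
    )
    return results
```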
app.py CHANGED
@@ -1,12 +1,39 @@
 
1
  import gradio as gr
2
  import pandas as pd
3
  from apscheduler.schedulers.background import BackgroundScheduler
4
  from huggingface_hub import snapshot_download
5
 
6
- # import os
 
7
 
8
  # # Define the folders to delete
9
- # folders_to_delete = ['eval-results', 'eval-queue']
10
 
11
  # import shutil
12
 
@@ -29,61 +56,41 @@ from huggingface_hub import snapshot_download
29
  # # Find and kill processes running on port 7878
30
  # try:
31
  # # Find process using port 7878
32
- # output = subprocess.check_output(["lsof", "-ti", "tcp:7878"]).decode().strip()
33
  # if output:
34
  # # Split the output in case there are multiple PIDs
35
  # pids = output.split('\n')
36
  # for pid in pids:
37
  # # Kill each process
38
  # os.kill(int(pid), signal.SIGKILL)
39
- # result = "Processes running on port 7878 have been killed."
40
  # else:
41
- # result = "No processes are running on port 7878."
42
  # except Exception as e:
43
  # result = f"An error occurred: {str(e)}"
44
 
45
 
46
- from src.display.about import (
47
- CITATION_BUTTON_LABEL,
48
- CITATION_BUTTON_TEXT,
49
- EVALUATION_QUEUE_TEXT,
50
- INTRODUCTION_TEXT,
51
- LLM_BENCHMARKS_TEXT,
52
- TITLE,
53
- )
54
- from src.display.css_html_js import custom_css
55
- from src.display.utils import (
56
- BENCHMARK_COLS,
57
- COLS,
58
- EVAL_COLS,
59
- EVAL_TYPES,
60
- NUMERIC_INTERVALS,
61
- TYPES,
62
- AutoEvalColumn,
63
- ModelType,
64
- fields,
65
- WeightType,
66
- Precision
67
- )
68
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, TOKEN, QUEUE_REPO, REPO_ID, RESULTS_REPO
69
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
70
- from src.submission.submit import add_new_eval
71
 
72
 
73
  def restart_space():
74
- API.restart_space(repo_id=REPO_ID, token=TOKEN)
 
 
 
75
 
76
  try:
77
  print(EVAL_REQUESTS_PATH)
78
  snapshot_download(
79
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
80
  )
81
  except Exception:
82
  restart_space()
83
  try:
84
  print(EVAL_RESULTS_PATH)
85
  snapshot_download(
86
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
87
  )
88
  except Exception:
89
  restart_space()
@@ -154,12 +161,12 @@ def filter_models(
154
  df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
155
  ) -> pd.DataFrame:
156
  # Show all models
157
- filtered_df = df
158
  # if show_deleted:
159
  # filtered_df = df
160
  # else: # Show only still on the hub models
161
  # filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
162
 
 
163
  type_emoji = [t[0] for t in type_query]
164
  filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
165
  filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
@@ -344,7 +351,7 @@ with demo:
344
  choices=[i.value.name for i in Precision if i != Precision.Unknown],
345
  label="Precision",
346
  multiselect=False,
347
- value="float16",
348
  interactive=True,
349
  )
350
  weight_type = gr.Dropdown(
@@ -383,11 +390,12 @@ with demo:
383
 
384
  scheduler = BackgroundScheduler()
385
  scheduler.add_job(restart_space, "interval", seconds=1800)
 
386
  scheduler.start()
387
  demo.queue(default_concurrency_limit=40).launch()
388
 
389
-
390
  # scheduler = BackgroundScheduler()
391
- # scheduler.add_job(restart_space, "interval", seconds=6 * 60 * 60)
 
392
  # scheduler.start()
393
- # demo.queue().launch()
 
1
+ import subprocess
2
  import gradio as gr
3
  import pandas as pd
4
  from apscheduler.schedulers.background import BackgroundScheduler
5
  from huggingface_hub import snapshot_download
6
 
7
+ from src.about import (
8
+ CITATION_BUTTON_LABEL,
9
+ CITATION_BUTTON_TEXT,
10
+ EVALUATION_QUEUE_TEXT,
11
+ INTRODUCTION_TEXT,
12
+ LLM_BENCHMARKS_TEXT,
13
+ TITLE,
14
+ )
15
+ from src.display.css_html_js import custom_css
16
+ from src.display.utils import (
17
+ BENCHMARK_COLS,
18
+ COLS,
19
+ EVAL_COLS,
20
+ EVAL_TYPES,
21
+ NUMERIC_INTERVALS,
22
+ TYPES,
23
+ AutoEvalColumn,
24
+ ModelType,
25
+ fields,
26
+ WeightType,
27
+ Precision
28
+ )
29
+ from src.envs import API, DEVICE, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
30
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
31
+ from src.submission.submit import add_new_eval
32
+
33
+ import os
34
 
35
  # # Define the folders to delete
36
+ # folders_to_delete = ['eval-results', 'eval-queue', 'eval-queue-bk', 'eval-results-bk']
37
 
38
  # import shutil
39
 
 
56
  # # Find and kill processes running on port 7878
57
  # try:
58
  # # Find process using port 7878
59
+ # output = subprocess.check_output(["lsof", "-ti", "tcp:7862"]).decode().strip()
60
  # if output:
61
  # # Split the output in case there are multiple PIDs
62
  # pids = output.split('\n')
63
  # for pid in pids:
64
  # # Kill each process
65
  # os.kill(int(pid), signal.SIGKILL)
66
+ # result = "Processes running on port 7862 have been killed."
67
  # else:
68
+ # result = "No processes are running on port 7862."
69
  # except Exception as e:
70
  # result = f"An error occurred: {str(e)}"
71
 
72
 
73
+ subprocess.run(["python3", "scripts/fix_harness_import.py"])
74
+
 
75
 
76
 
77
  def restart_space():
78
+ API.restart_space(repo_id=REPO_ID)
79
+
80
+ def launch_backend():
81
+ _ = subprocess.run(["python3", "main_backend.py"])
82
 
83
  try:
84
  print(EVAL_REQUESTS_PATH)
85
  snapshot_download(
86
+ repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
87
  )
88
  except Exception:
89
  restart_space()
90
  try:
91
  print(EVAL_RESULTS_PATH)
92
  snapshot_download(
93
+ repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
94
  )
95
  except Exception:
96
  restart_space()
 
161
  df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
162
  ) -> pd.DataFrame:
163
  # Show all models
 
164
  # if show_deleted:
165
  # filtered_df = df
166
  # else: # Show only still on the hub models
167
  # filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
168
 
169
+ filtered_df = df
170
  type_emoji = [t[0] for t in type_query]
171
  filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
172
  filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
 
351
  choices=[i.value.name for i in Precision if i != Precision.Unknown],
352
  label="Precision",
353
  multiselect=False,
354
+ value="float16" if DEVICE != "cpu" else "float32",
355
  interactive=True,
356
  )
357
  weight_type = gr.Dropdown(
 
390
 
391
  scheduler = BackgroundScheduler()
392
  scheduler.add_job(restart_space, "interval", seconds=1800)
393
+ scheduler.add_job(launch_backend, "interval", seconds=100) # will only allow one job to be run at the same time
394
  scheduler.start()
395
  demo.queue(default_concurrency_limit=40).launch()
396
 
 
397
  # scheduler = BackgroundScheduler()
398
+ # scheduler.add_job(restart_space, "interval", seconds=1800)
399
+ # scheduler.add_job(launch_backend, "interval", seconds=100) # will only allow one job to be run at the same time
400
  # scheduler.start()
401
+ # demo.queue().launch()
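A note on the `launch_backend` job above: the "only one job at a time" behaviour comes from APScheduler's default `max_instances=1`, so a tick that fires while a backend run is still in progress is skipped rather than run in parallel. A hedged sketch making those defaults explicit (`max_instances` and `coalesce` are standard APScheduler options, not additions from this commit):

```python
# Hedged sketch: the scheduling used in app.py, with APScheduler's relevant
# options spelled out instead of relying on their defaults.
import subprocess
from apscheduler.schedulers.background import BackgroundScheduler

def launch_backend():
    subprocess.run(["python3", "main_backend.py"])

scheduler = BackgroundScheduler()
# max_instances=1 (the default) skips a tick if the previous backend run is still going;
# coalesce=True merges any missed ticks into a single catch-up run.
scheduler.add_job(launch_backend, "interval", seconds=100, max_instances=1, coalesce=True)
scheduler.start()
```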
main_backend.py ADDED
@@ -0,0 +1,78 @@
1
+ import logging
2
+ import pprint
3
+
4
+ from huggingface_hub import snapshot_download
5
+
6
+ logging.getLogger("openai").setLevel(logging.WARNING)
7
+
8
+ from src.backend.run_eval_suite import run_evaluation
9
+ from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
10
+ from src.backend.sort_queue import sort_models_by_priority
11
+
12
+ from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, DEVICE, API, LIMIT, TOKEN
13
+ from src.about import Tasks, NUM_FEWSHOT
14
+ TASKS_HARNESS = [task.value.benchmark for task in Tasks]
15
+
16
+ logging.basicConfig(level=logging.ERROR)
17
+ pp = pprint.PrettyPrinter(width=80)
18
+
19
+ PENDING_STATUS = "PENDING"
20
+ RUNNING_STATUS = "RUNNING"
21
+ FINISHED_STATUS = "FINISHED"
22
+ FAILED_STATUS = "FAILED"
23
+
24
+ snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
25
+ snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
26
+
27
+ def run_auto_eval():
28
+ current_pending_status = [PENDING_STATUS]
29
+
30
+ # pull the eval dataset from the hub and parse any eval requests
31
+ # check completed evals and set them to finished
32
+ check_completed_evals(
33
+ api=API,
34
+ checked_status=RUNNING_STATUS,
35
+ completed_status=FINISHED_STATUS,
36
+ failed_status=FAILED_STATUS,
37
+ hf_repo=QUEUE_REPO,
38
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
39
+ hf_repo_results=RESULTS_REPO,
40
+ local_dir_results=EVAL_RESULTS_PATH_BACKEND
41
+ )
42
+
43
+ # Get all eval requests that are PENDING; if you want to run other evals, change this parameter
44
+ eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
45
+ # Sort the evals by priority (first submitted first run)
46
+ eval_requests = sort_models_by_priority(api=API, models=eval_requests)
47
+
48
+ print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
49
+
50
+ if len(eval_requests) == 0:
51
+ return
52
+
53
+ eval_request = eval_requests[0]
54
+ pp.pprint(eval_request)
55
+
56
+ set_eval_request(
57
+ api=API,
58
+ eval_request=eval_request,
59
+ set_to_status=RUNNING_STATUS,
60
+ hf_repo=QUEUE_REPO,
61
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
62
+ )
63
+
64
+ run_evaluation(
65
+ eval_request=eval_request,
66
+ task_names=TASKS_HARNESS,
67
+ num_fewshot=NUM_FEWSHOT,
68
+ local_dir=EVAL_RESULTS_PATH_BACKEND,
69
+ results_repo=RESULTS_REPO,
70
+ batch_size=1,
71
+ device=DEVICE,
72
+ no_cache=True,
73
+ limit=LIMIT
74
+ )
75
+
76
+
77
+ if __name__ == "__main__":
78
+ run_auto_eval()
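main_backend.py pulls its task list and few-shot setting from `src.about` (`Tasks`, `NUM_FEWSHOT`). As a hedged sketch, adding another benchmark to the leaderboard is just one more `Task` entry there; the `my_new_task` key and display name below are placeholders, and the field names follow the template's `Task` dataclass:

```python
# Hedged sketch of the task block in src/about.py; "my_new_task" is a placeholder.
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str   # task_key in the results json
    metric: str      # metric_key in the results json
    col_name: str    # column name shown on the leaderboard


class Tasks(Enum):
    task0 = Task("medmcqa", "acc,none", "MedMCQA")
    # ... the other existing tasks ...
    task9 = Task("my_new_task", "acc,none", "My New Task")  # placeholder entry


NUM_FEWSHOT = 0  # raise this if your tasks should be run few-shot
```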
requirements.txt CHANGED
@@ -12,4 +12,7 @@ python-dateutil==2.8.2
12
  requests==2.28.2
13
  tqdm==4.65.0
14
  transformers
15
- tokenizers>=0.15.0
 
 
 
 
12
  requests==2.28.2
13
  tqdm==4.65.0
14
  transformers
15
+ tokenizers>=0.15.0
16
+ git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
17
+ accelerate==0.24.1
18
+ sentencepiece
scripts/create_request_file.py CHANGED
@@ -7,11 +7,9 @@ from datetime import datetime, timezone
7
  import click
8
  from colorama import Fore
9
  from huggingface_hub import HfApi, snapshot_download
 
10
 
11
- EVAL_REQUESTS_PATH = "eval-queue"
12
- QUEUE_REPO = "open-llm-leaderboard/requests"
13
-
14
- precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
15
  model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
16
  weight_types = ("Original", "Delta", "Adapter")
17
 
@@ -36,7 +34,7 @@ def get_model_size(model_info, precision: str):
36
  def main():
37
  api = HfApi()
38
  current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
39
- snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset")
40
 
41
  model_name = click.prompt("Enter model name")
42
  revision = click.prompt("Enter revision", default="main")
 
7
  import click
8
  from colorama import Fore
9
  from huggingface_hub import HfApi, snapshot_download
10
+ from src.envs import TOKEN, EVAL_REQUESTS_PATH, QUEUE_REPO
11
 
12
+ precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ", "float32")
 
 
 
13
  model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
14
  weight_types = ("Original", "Delta", "Adapter")
15
 
 
34
  def main():
35
  api = HfApi()
36
  current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
37
+ snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN)
38
 
39
  model_name = click.prompt("Enter model name")
40
  revision = click.prompt("Enter revision", default="main")
scripts/fix_harness_import.py ADDED
@@ -0,0 +1,11 @@
1
+ """This file should be used after `pip install -r requirements.txt`.
2
+ It creates a folder not ported during harness package creation (they don't use a Manifest file at the moment and it ignores `.json` files).
3
+ It will need to be updated if we want to use the harness' version of big bench to actually copy the json files.
4
+ """
5
+ import os
6
+
7
+ import lm_eval
8
+
9
+ if __name__ == "__main__":
10
+ lm_eval_path = lm_eval.__path__[0]
11
+ os.makedirs(os.path.join(lm_eval_path, "datasets", "bigbench_resources"), exist_ok=True)
src/{display/about.py → about.py} RENAMED
@@ -1,7 +1,5 @@
1
  from dataclasses import dataclass
2
  from enum import Enum
3
- # from src.display.utils import ModelType
4
-
5
 
6
  @dataclass
7
  class Task:
@@ -10,7 +8,8 @@ class Task:
10
  col_name: str
11
 
12
 
13
- # Init: to update with your specific keys
 
14
  class Tasks(Enum):
15
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
16
  task0 = Task("medmcqa", "acc,none", "MedMCQA")
@@ -23,18 +22,20 @@ class Tasks(Enum):
23
  task7 = Task("mmlu_professional_medicine", "acc,none", "MMLU Professional Medicine")
24
  task8 = Task("pubmedqa", "acc,none", "PubMedQA")
25
 
 
26
 
27
- # "medmcqa", "acc,none", "MedMCQA"
28
 
29
- # Your leaderboard name
30
- # <h1 align="center" style="color: #1a237e;"> Open Medical-LLM Leaderboard</h1>
 
 
 
31
  TITLE = """
32
 
33
 
34
  <div style="text-align: center; margin-bottom: 20px;">
35
  <img src="https://raw.githubusercontent.com/monk1337/MultiMedQA/main/assets/logs.png" alt="Descriptive Alt Text" style="display: block; margin: auto; height: 160px;">
36
  </div>
37
-
38
  <h1 align="center" style="color: #1a237e; font-size: 40px;">Open <span style="color: #990001;">Medical-LLM</span> Leaderboard</h1>
39
 
40
 
@@ -44,15 +45,17 @@ TITLE = """
44
  INTRODUCTION_TEXT = """
45
  🩺 The Open Medical LLM Leaderboard aims to track, rank and evaluate the performance of large language models (LLMs) on medical question answering tasks. It evaluates LLMs across a diverse array of medical datasets, including MedQA (USMLE), PubMedQA, MedMCQA, and subsets of MMLU related to medicine and biology. The leaderboard offers a comprehensive assessment of each model's medical knowledge and question answering capabilities.
46
 
 
47
  The datasets cover various aspects of medicine such as general medical knowledge, clinical knowledge, anatomy, genetics, and more. They contain multiple-choice and open-ended questions that require medical reasoning and understanding. More details on the datasets can be found in the "LLM Benchmarks Details" section below.
48
 
 
49
  The main evaluation metric used is Accuracy (ACC). Submit a model for automated evaluation on the "Submit" page. If you have comments or suggestions on additional medical datasets to include, please reach out to us in our discussion forum.
50
 
 
51
  The backend of the Open Medical LLM Leaderboard uses the Eleuther AI Language Model Evaluation Harness. More technical details can be found in the "About" page.
52
  """
53
 
54
  LLM_BENCHMARKS_TEXT = f"""
55
-
56
  <h2 style="color: #2c3e50;"> Why Leaderboard? </h2>
57
 
58
  Evaluating the medical knowledge and clinical reasoning capabilities of LLMs is crucial as they are increasingly being applied to healthcare and biomedical applications. The Open Medical LLM Leaderboard provides a platform to assess the latest LLMs on their performance on a variety of medical question answering tasks. This can help identify the strengths and gaps in medical understanding of current models.
@@ -64,18 +67,15 @@ Evaluating the medical knowledge and clinical reasoning capabilities of LLMs is
64
  <h2 style="color: #2c3e50;">About Open Life Science AI</h2>
65
  An Open Life Science Project to Benchmark and Track AI Progress, Share Models and Datasets in the Life Science Field.
66
  <a href="https://openlifescience.ai/" target="_blank"> More info </a>
67
-
68
-
69
  <h2 style="color: #2c3e50;">Datasets</h2>
70
 
71
  <div style="font-family: Arial, sans-serif; line-height: 1.6; color: #333;"> <ul style="list-style-type: none; padding: 0;"> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.13081" target="_blank" style="color: #3498db;">MedQA (USMLE)</a></h3> <p>1273 real-world questions from the US Medical License Exams (USMLE) to test general medical knowledge</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/1909.06146" target="_blank" style="color: #3498db;">PubMedQA</a></h3> <p>500 questions constructed from PubMed article titles along with the abstracts as context to test understanding of biomedical research</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://proceedings.mlr.press/v174/pal22a.html" target="_blank" style="color: #3498db;">MedMCQA</a></h3> <p>4183 questions from Indian medical entrance exams (AIIMS & NEET PG) spanning 2.4k healthcare topics</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-Clinical knowledge</a></h3> <p>265 multiple choice questions on clinical knowledge</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-Medical genetics</a></h3> <p>100 MCQs on medical genetics</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-Anatomy</a></h3> <p>135 anatomy MCQs</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-Professional medicine</a></h3> <p>272 MCQs on professional medicine</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-College biology</a></h3> <p>144 MCQs on college-level biology</p> </li> <li> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-College medicine</a></h3> <p>173 college medicine MCQs</p> </li> </ul> </div>
72
 
73
- <div style="font-family: Arial, sans-serif; line-height: 1.6; color: #333;"> <h2 style="color: #2c3e50;">Evaluation Metric</h2> <p>Metric Accuracy (ACC) is used as the main evaluation metric across all datasets.</p> <h2 style="color: #2c3e50;">Details and Logs</h2> <p>Detailed results are available in the results directory:</p> <a href="https://huggingface.co/datasets/openlifescienceai/results" target="_blank" style="color: #3498db;">https://huggingface.co/datasets/openlifescienceai/results</a> <p>Input/outputs for each model can be found in the details page accessible by clicking the 📄 emoji next to the model name.</p> <h2 style="color: #2c3e50;">Reproducibility</h2> <p>To reproduce the results, you can run this evaluation script:</p> <pre style="background-color: #f0f0f0; padding: 10px; border-radius: 5px;">python eval_medical_llm.py</pre> <p>To evaluate a specific dataset on a model, use the EleutherAI LLM Evaluation Harness:</p> <pre style="background-color: #f0f0f0; padding: 10px; border-radius: 5px;">python main.py --model=hf-auto --model_args="pretrained=&lt;model&gt;,revision=&lt;revision&gt;,parallelize=True" --tasks=&lt;dataset&gt; --num_fewshot=&lt;n_shots&gt; --batch_size=1 --output_path=&lt;output_dir&gt;</pre> <p>Note some datasets may require additional setup, refer to the Evaluation Harness documentation.</p> <p>Adjust batch size based on your GPU memory if not using parallelism. Minor variations in results are expected with different batch sizes due to padding.</p> <h2 style="color: #2c3e50;">Icons</h2> <ul style="list-style-type: none; padding: 0;"> <li>🟢 Pre-trained model</li> <li>🔶 Fine-tuned model</li> <li>? Unknown model type</li> <li>⭕ Instruction-tuned</li> <li>🟦 RL-tuned</li> </ul> <p>Missing icons indicate the model info is not yet added, feel free to open an issue to include it!</p> </div>
74
 
 
75
  """
76
 
77
  LLM_BENCHMARKS_DETAILS = f"""
78
-
79
  Datasets
80
  <a href="https://arxiv.org/abs/2009.13081" target="_blank">MedQA (USMLE)</a> - 1273 real-world questions from the US Medical License Exams (USMLE) to test general medical knowledge
81
  <a href="https://arxiv.org/abs/1909.06146" target="_blank">PubMedQA</a> - 500 questions constructed from PubMed article titles along with the abstracts as context to test understanding of biomedical research
@@ -93,14 +93,10 @@ Detailed results are available in the results directory: https://huggingface.co/
93
  Input/outputs for each model can be found in the details page accessible by clicking the 📄 emoji next to the model name
94
  Reproducibility
95
  To reproduce the results, you can run this evaluation script: python eval_medical_llm.py.
96
-
97
  To evaluate a specific dataset on a model, use the EleutherAI LLM Evaluation Harness:
98
-
99
  python main.py --model=hf-auto --model_args="pretrained=<model>,revision=<revision>,parallelize=True"
100
  --tasks=<dataset> --num_fewshot=<n_shots> --batch_size=1 --output_path=<output_dir>
101
-
102
  Note some datasets may require additional setup, refer to the Evaluation Harness documentation. Adjust batch size based on your GPU memory if not using parallelism. Minor variations in results are expected with different batch sizes due to padding.
103
-
104
  Icons
105
  🟢 Pre-trained model
106
  🔶 Fine-tuned model
@@ -114,28 +110,24 @@ FAQ_TEXT = """
114
  FAQ
115
  1) Submitting a model
116
  XXX
117
-
118
  2) Model results
119
  XXX
120
-
121
  3) Editing a submission
122
  XXX
123
  """
124
 
125
  EVALUATION_QUEUE_TEXT = """
126
-
127
  Evaluation Queue for the Open Medical LLM Leaderboard
128
  Models added here will be automatically evaluated.
129
 
130
  Before submitting a model
131
  1) Verify loading with AutoClasses:
132
- python
133
 
134
 
135
- Copy code
136
  from transformers import AutoConfig, AutoModel, AutoTokenizer
137
  config = AutoConfig.from_pretrained("model-name", revision=revision)
138
  model = AutoModel.from_pretrained("model-name", revision=revision)
 
139
  tokenizer = AutoTokenizer.from_pretrained("model-name", revision=revision)
140
  Debug any loading errors before submission. Make sure the model is public.
141
 
@@ -162,7 +154,6 @@ year = {2024},
162
  publisher = {Hugging Face},
163
  howpublished = "\url{https://huggingface.co/spaces/openlifescienceai/open_medical_llm_leaderboard}"
164
  }
165
-
166
  @misc{singhal2022large,
167
  title={Large Language Models Encode Clinical Knowledge},
168
  author={Karan Singhal et al.},
@@ -171,5 +162,4 @@ howpublished = "\url{https://huggingface.co/spaces/openlifescienceai/open_medica
171
  archivePrefix={arXiv},
172
  primaryClass={cs.CL}
173
  }
174
-
175
  """
 
1
  from dataclasses import dataclass
2
  from enum import Enum
 
 
3
 
4
  @dataclass
5
  class Task:
 
8
  col_name: str
9
 
10
 
11
+ # Select your tasks here
12
+ # ---------------------------------------------------
13
  class Tasks(Enum):
14
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
  task0 = Task("medmcqa", "acc,none", "MedMCQA")
 
22
  task7 = Task("mmlu_professional_medicine", "acc,none", "MMLU Professional Medicine")
23
  task8 = Task("pubmedqa", "acc,none", "PubMedQA")
24
 
25
+
26
 
 
27
 
28
+ NUM_FEWSHOT = 0 # Change to your number of few-shot examples
29
+ # ---------------------------------------------------
30
+
31
+
32
+
33
  TITLE = """
34
 
35
 
36
  <div style="text-align: center; margin-bottom: 20px;">
37
  <img src="https://raw.githubusercontent.com/monk1337/MultiMedQA/main/assets/logs.png" alt="Descriptive Alt Text" style="display: block; margin: auto; height: 160px;">
38
  </div>
 
39
  <h1 align="center" style="color: #1a237e; font-size: 40px;">Open <span style="color: #990001;">Medical-LLM</span> Leaderboard</h1>
40
 
41
 
 
45
  INTRODUCTION_TEXT = """
46
  🩺 The Open Medical LLM Leaderboard aims to track, rank and evaluate the performance of large language models (LLMs) on medical question answering tasks. It evaluates LLMs across a diverse array of medical datasets, including MedQA (USMLE), PubMedQA, MedMCQA, and subsets of MMLU related to medicine and biology. The leaderboard offers a comprehensive assessment of each model's medical knowledge and question answering capabilities.
47
 
48
+
49
  The datasets cover various aspects of medicine such as general medical knowledge, clinical knowledge, anatomy, genetics, and more. They contain multiple-choice and open-ended questions that require medical reasoning and understanding. More details on the datasets can be found in the "LLM Benchmarks Details" section below.
50
 
51
+
52
  The main evaluation metric used is Accuracy (ACC). Submit a model for automated evaluation on the "Submit" page. If you have comments or suggestions on additional medical datasets to include, please reach out to us in our discussion forum.
53
 
54
+
55
  The backend of the Open Medical LLM Leaderboard uses the Eleuther AI Language Model Evaluation Harness. More technical details can be found in the "About" page.
56
  """
57
 
58
  LLM_BENCHMARKS_TEXT = f"""
 
59
  <h2 style="color: #2c3e50;"> Why Leaderboard? </h2>
60
 
61
  Evaluating the medical knowledge and clinical reasoning capabilities of LLMs is crucial as they are increasingly being applied to healthcare and biomedical applications. The Open Medical LLM Leaderboard provides a platform to assess the latest LLMs on their performance on a variety of medical question answering tasks. This can help identify the strengths and gaps in medical understanding of current models.
 
67
  <h2 style="color: #2c3e50;">About Open Life Science AI</h2>
68
  An Open Life Science Project to Benchmark and Track AI Progress, Share Models and Datasets in the Life Science Field.
69
  <a href="https://openlifescience.ai/" target="_blank"> More info </a>
 
 
70
  <h2 style="color: #2c3e50;">Datasets</h2>
71
 
72
  <div style="font-family: Arial, sans-serif; line-height: 1.6; color: #333;"> <ul style="list-style-type: none; padding: 0;"> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.13081" target="_blank" style="color: #3498db;">MedQA (USMLE)</a></h3> <p>1273 real-world questions from the US Medical License Exams (USMLE) to test general medical knowledge</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/1909.06146" target="_blank" style="color: #3498db;">PubMedQA</a></h3> <p>500 questions constructed from PubMed article titles along with the abstracts as context to test understanding of biomedical research</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://proceedings.mlr.press/v174/pal22a.html" target="_blank" style="color: #3498db;">MedMCQA</a></h3> <p>4183 questions from Indian medical entrance exams (AIIMS & NEET PG) spanning 2.4k healthcare topics</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-Clinical knowledge</a></h3> <p>265 multiple choice questions on clinical knowledge</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-Medical genetics</a></h3> <p>100 MCQs on medical genetics</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-Anatomy</a></h3> <p>135 anatomy MCQs</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-Professional medicine</a></h3> <p>272 MCQs on professional medicine</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-College biology</a></h3> <p>144 MCQs on college-level biology</p> </li> <li> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-College medicine</a></h3> <p>173 college medicine MCQs</p> </li> </ul> </div>
73
 
 
74
 
75
+ <div style="font-family: Arial, sans-serif; line-height: 1.6; color: #333;"> <h2 style="color: #2c3e50;">Evaluation Metric</h2> <p>Metric Accuracy (ACC) is used as the main evaluation metric across all datasets.</p> <h2 style="color: #2c3e50;">Details and Logs</h2> <p>Detailed results are available in the results directory:</p> <a href="https://huggingface.co/datasets/openlifescienceai/results" target="_blank" style="color: #3498db;">https://huggingface.co/datasets/openlifescienceai/results</a> <p>Input/outputs for each model can be found in the details page accessible by clicking the 📄 emoji next to the model name.</p> <h2 style="color: #2c3e50;">Reproducibility</h2> <p>To reproduce the results, you can run this evaluation script:</p> <pre style="background-color: #f0f0f0; padding: 10px; border-radius: 5px;">python eval_medical_llm.py</pre> <p>To evaluate a specific dataset on a model, use the EleutherAI LLM Evaluation Harness:</p> <pre style="background-color: #f0f0f0; padding: 10px; border-radius: 5px;">python main.py --model=hf-auto --model_args="pretrained=&lt;model&gt;,revision=&lt;revision&gt;,parallelize=True" --tasks=&lt;dataset&gt; --num_fewshot=&lt;n_shots&gt; --batch_size=1 --output_path=&lt;output_dir&gt;</pre> <p>Note some datasets may require additional setup, refer to the Evaluation Harness documentation.</p> <p>Adjust batch size based on your GPU memory if not using parallelism. Minor variations in results are expected with different batch sizes due to padding.</p> <h2 style="color: #2c3e50;">Icons</h2> <ul style="list-style-type: none; padding: 0;"> <li>🟢 Pre-trained model</li> <li>🔶 Fine-tuned model</li> <li>? Unknown model type</li> <li>⭕ Instruction-tuned</li> <li>🟦 RL-tuned</li> </ul> <p>Missing icons indicate the model info is not yet added, feel free to open an issue to include it!</p> </div>
76
  """
77
 
78
  LLM_BENCHMARKS_DETAILS = f"""
 
79
  Datasets
80
  <a href="https://arxiv.org/abs/2009.13081" target="_blank">MedQA (USMLE)</a> - 1273 real-world questions from the US Medical License Exams (USMLE) to test general medical knowledge
81
  <a href="https://arxiv.org/abs/1909.06146" target="_blank">PubMedQA</a> - 500 questions constructed from PubMed article titles along with the abstracts as context to test understanding of biomedical research
 
93
  Input/outputs for each model can be found in the details page accessible by clicking the 📄 emoji next to the model name
94
  Reproducibility
95
  To reproduce the results, you can run this evaluation script: python eval_medical_llm.py.
 
96
  To evaluate a specific dataset on a model, use the EleutherAI LLM Evaluation Harness:
 
97
  python main.py --model=hf-auto --model_args="pretrained=<model>,revision=<revision>,parallelize=True"
98
  --tasks=<dataset> --num_fewshot=<n_shots> --batch_size=1 --output_path=<output_dir>
 
99
  Note some datasets may require additional setup, refer to the Evaluation Harness documentation. Adjust batch size based on your GPU memory if not using parallelism. Minor variations in results are expected with different batch sizes due to padding.
 
100
  Icons
101
  🟢 Pre-trained model
102
  🔶 Fine-tuned model
 
110
  FAQ
111
  1) Submitting a model
112
  XXX
 
113
  2) Model results
114
  XXX
 
115
  3) Editing a submission
116
  XXX
117
  """
118
 
119
  EVALUATION_QUEUE_TEXT = """
 
120
  Evaluation Queue for the Open Medical LLM Leaderboard
121
  Models added here will be automatically evaluated.
122
 
123
  Before submitting a model
124
  1) Verify loading with AutoClasses:
 
125
 
126
 
 
127
  from transformers import AutoConfig, AutoModel, AutoTokenizer
128
  config = AutoConfig.from_pretrained("model-name", revision=revision)
129
  model = AutoModel.from_pretrained("model-name", revision=revision)
130
+
131
  tokenizer = AutoTokenizer.from_pretrained("model-name", revision=revision)
132
  Debug any loading errors before submission. Make sure the model is public.
133
 
 
154
  publisher = {Hugging Face},
155
  howpublished = "\url{https://huggingface.co/spaces/openlifescienceai/open_medical_llm_leaderboard}"
156
  }
 
157
  @misc{singhal2022large,
158
  title={Large Language Models Encode Clinical Knowledge},
159
  author={Karan Singhal et al.},
 
162
  archivePrefix={arXiv},
163
  primaryClass={cs.CL}
164
  }
 
165
  """
src/backend/manage_requests.py ADDED
@@ -0,0 +1,122 @@
1
+ import glob
2
+ import json
3
+ from dataclasses import dataclass
4
+ from typing import Optional
5
+
6
+ from huggingface_hub import HfApi, snapshot_download
7
+ from src.envs import TOKEN
8
+
9
+ @dataclass
10
+ class EvalRequest:
11
+ model: str
12
+ private: bool
13
+ status: str
14
+ json_filepath: str
15
+ weight_type: str = "Original"
16
+ model_type: str = "" # pretrained, finetuned, with RL
17
+ precision: str = "" # float16, bfloat16
18
+ base_model: Optional[str] = None # for adapter models
19
+ revision: str = "main" # commit
20
+ submitted_time: Optional[str] = "2022-05-18T11:40:22.519222" # random date just so that we can still order requests by date
21
+ model_type: Optional[str] = None
22
+ likes: Optional[int] = 0
23
+ params: Optional[int] = None
24
+ license: Optional[str] = ""
25
+
26
+ def get_model_args(self):
27
+ model_args = f"pretrained={self.model},revision={self.revision}"
28
+
29
+ if self.precision in ["float16", "bfloat16", "float32"]:
30
+ model_args += f",dtype={self.precision}"
31
+ # Quantized models need some added config, the install of bits and bytes, etc
32
+ #elif self.precision == "8bit":
33
+ # model_args += ",load_in_8bit=True"
34
+ #elif self.precision == "4bit":
35
+ # model_args += ",load_in_4bit=True"
36
+ #elif self.precision == "GPTQ":
37
+ # A GPTQ model does not need dtype to be specified,
38
+ # it will be inferred from the config
39
+ pass
40
+ else:
41
+ raise Exception(f"Unknown precision {self.precision}.")
42
+
43
+ return model_args
44
+
45
+
46
+ def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str, hf_repo: str, local_dir: str):
47
+ """Updates a given eval request with its new status on the hub (running, completed, failed, ...)"""
48
+ json_filepath = eval_request.json_filepath
49
+
50
+ with open(json_filepath) as fp:
51
+ data = json.load(fp)
52
+
53
+ data["status"] = set_to_status
54
+
55
+ with open(json_filepath, "w") as f:
56
+ f.write(json.dumps(data))
57
+
58
+ api.upload_file(
59
+ path_or_fileobj=json_filepath,
60
+ path_in_repo=json_filepath.replace(local_dir, ""),
61
+ repo_id=hf_repo,
62
+ repo_type="dataset",
63
+ )
64
+
65
+
66
+ def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
67
+ """Get all pending evaluation requests and return a list in which private
68
+ models appear first, followed by public models sorted by the number of
69
+ likes.
70
+
71
+ Returns:
72
+ `list[EvalRequest]`: a list of eval requests.
73
+ """
74
+ snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60, token=TOKEN)
75
+ json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
76
+
77
+ eval_requests = []
78
+ for json_filepath in json_files:
79
+ with open(json_filepath) as fp:
80
+ data = json.load(fp)
81
+ if data["status"] in job_status:
82
+ data["json_filepath"] = json_filepath
83
+ eval_request = EvalRequest(**data)
84
+ eval_requests.append(eval_request)
85
+
86
+ return eval_requests
87
+
88
+
89
+ def check_completed_evals(
90
+ api: HfApi,
91
+ hf_repo: str,
92
+ local_dir: str,
93
+ checked_status: str,
94
+ completed_status: str,
95
+ failed_status: str,
96
+ hf_repo_results: str,
97
+ local_dir_results: str,
98
+ ):
99
+ """Checks if the currently running evals are completed, if yes, update their status on the hub."""
100
+ snapshot_download(repo_id=hf_repo_results, revision="main", local_dir=local_dir_results, repo_type="dataset", max_workers=60, token=TOKEN)
101
+
102
+ running_evals = get_eval_requests(checked_status, hf_repo=hf_repo, local_dir=local_dir)
103
+
104
+ for eval_request in running_evals:
105
+ model = eval_request.model
106
+ print("====================================")
107
+ print(f"Checking {model}")
108
+
109
+ output_path = model
110
+ output_file = f"{local_dir_results}/{output_path}/results*.json"
111
+ output_file_exists = len(glob.glob(output_file)) > 0
112
+
113
+ if output_file_exists:
114
+ print(
115
+ f"EXISTS output file exists for {model} setting it to {completed_status}"
116
+ )
117
+ set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
118
+ else:
119
+ print(
120
+ f"No result file found for {model} setting it to {failed_status}"
121
+ )
122
+ set_eval_request(api, eval_request, failed_status, hf_repo, local_dir)
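To see how a queued request is turned into harness arguments, here is a hedged sketch that builds an `EvalRequest` by hand and inspects `get_model_args()` (the repo id and file path are placeholders):

```python
# Hedged sketch: building an EvalRequest by hand to see the harness arguments it produces.
from src.backend.manage_requests import EvalRequest

request = EvalRequest(
    model="my-org/my-model",   # placeholder repo id
    private=False,
    status="PENDING",
    json_filepath="eval-queue-bk/my-org/placeholder_eval_request.json",  # placeholder path
    precision="float16",
    revision="main",
)

print(request.get_model_args())
# -> pretrained=my-org/my-model,revision=main,dtype=float16
```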
src/backend/run_eval_suite.py ADDED
@@ -0,0 +1,57 @@
1
+ import json
2
+ import os
3
+ import logging
4
+ from datetime import datetime
5
+
6
+ from lm_eval import tasks, evaluator, utils
7
+
8
+ from src.envs import RESULTS_REPO, API
9
+ from src.backend.manage_requests import EvalRequest
10
+
11
+ logging.getLogger("openai").setLevel(logging.WARNING)
12
+
13
+ def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, local_dir: str, results_repo: str, no_cache=True, limit=None):
14
+ if limit:
15
+ print(
16
+ "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
17
+ )
18
+
19
+ task_names = utils.pattern_match(task_names, tasks.ALL_TASKS)
20
+
21
+ print(f"Selected Tasks: {task_names}")
22
+
23
+ results = evaluator.simple_evaluate(
24
+ model="hf-causal-experimental", # "hf-causal"
25
+ model_args=eval_request.get_model_args(),
26
+ tasks=task_names,
27
+ num_fewshot=num_fewshot,
28
+ batch_size=batch_size,
29
+ device=device,
30
+ no_cache=no_cache,
31
+ limit=limit,
32
+ write_out=True,
33
+ output_base_path="logs"
34
+ )
35
+
36
+ results["config"]["model_dtype"] = eval_request.precision
37
+ results["config"]["model_name"] = eval_request.model
38
+ results["config"]["model_sha"] = eval_request.revision
39
+
40
+ dumped = json.dumps(results, indent=2)
41
+ print(dumped)
42
+
43
+ output_path = os.path.join(local_dir, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
44
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
45
+ with open(output_path, "w") as f:
46
+ f.write(dumped)
47
+
48
+ print(evaluator.make_table(results))
49
+
50
+ API.upload_file(
51
+ path_or_fileobj=output_path,
52
+ path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
53
+ repo_id=results_repo,
54
+ repo_type="dataset",
55
+ )
56
+
57
+ return results
src/backend/sort_queue.py ADDED
@@ -0,0 +1,28 @@
1
+ import re
2
+ from dataclasses import dataclass
3
+
4
+ from huggingface_hub import HfApi
5
+
6
+ from src.backend.manage_requests import EvalRequest
7
+
8
+
9
+ @dataclass
10
+ class ModelMetadata:
11
+ likes: int = 0
12
+ size: int = 15
13
+
14
+
15
+ def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
16
+ private_models = [model for model in models if model.private]
17
+ public_models = [model for model in models if not model.private]
18
+
19
+ return sort_by_submit_date(private_models) + sort_by_submit_date(public_models)
20
+
21
+ def sort_by_submit_date(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
22
+ return sorted(eval_requests, key=lambda x: x.submitted_time, reverse=False)
23
+
24
+ def sort_by_size(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
25
+ return sorted(eval_requests, key=lambda x: x.params, reverse=False)
26
+
27
+ def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
28
+ return sorted(eval_requests, key=lambda x: x.likes, reverse=False)
src/display/formatting.py CHANGED
@@ -1,12 +1,3 @@
1
- import os
2
- from datetime import datetime, timezone
3
-
4
- from huggingface_hub import HfApi
5
- from huggingface_hub.hf_api import ModelInfo
6
-
7
-
8
- API = HfApi()
9
-
10
  def model_hyperlink(link, model_name):
11
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
12
 
1
  def model_hyperlink(link, model_name):
2
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
3
 
src/display/utils.py CHANGED
@@ -3,7 +3,7 @@ from enum import Enum
3
 
4
  import pandas as pd
5
 
6
- from src.display.about import Tasks
7
 
8
  def fields(raw_class):
9
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -94,9 +94,10 @@ class WeightType(Enum):
94
  class Precision(Enum):
95
  float16 = ModelDetails("float16")
96
  bfloat16 = ModelDetails("bfloat16")
97
- qt_8bit = ModelDetails("8bit")
98
- qt_4bit = ModelDetails("4bit")
99
- qt_GPTQ = ModelDetails("GPTQ")
 
100
  Unknown = ModelDetails("?")
101
 
102
  def from_str(precision):
@@ -104,12 +105,14 @@ class Precision(Enum):
104
  return Precision.float16
105
  if precision in ["torch.bfloat16", "bfloat16"]:
106
  return Precision.bfloat16
107
- if precision in ["8bit"]:
108
- return Precision.qt_8bit
109
- if precision in ["4bit"]:
110
- return Precision.qt_4bit
111
- if precision in ["GPTQ", "None"]:
112
- return Precision.qt_GPTQ
 
 
113
  return Precision.Unknown
114
 
115
  # Column selection
 
3
 
4
  import pandas as pd
5
 
6
+ from src.about import Tasks
7
 
8
  def fields(raw_class):
9
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
94
  class Precision(Enum):
95
  float16 = ModelDetails("float16")
96
  bfloat16 = ModelDetails("bfloat16")
97
+ float32 = ModelDetails("float32")
98
+ #qt_8bit = ModelDetails("8bit")
99
+ #qt_4bit = ModelDetails("4bit")
100
+ #qt_GPTQ = ModelDetails("GPTQ")
101
  Unknown = ModelDetails("?")
102
 
103
  def from_str(precision):
 
105
  return Precision.float16
106
  if precision in ["torch.bfloat16", "bfloat16"]:
107
  return Precision.bfloat16
108
+ if precision in ["float32"]:
109
+ return Precision.float32
110
+ #if precision in ["8bit"]:
111
+ # return Precision.qt_8bit
112
+ #if precision in ["4bit"]:
113
+ # return Precision.qt_4bit
114
+ #if precision in ["GPTQ", "None"]:
115
+ # return Precision.qt_GPTQ
116
  return Precision.Unknown
117
 
118
  # Column selection
src/envs.py CHANGED
@@ -2,19 +2,27 @@ import os
2
 
3
  from huggingface_hub import HfApi
4
 
5
- # clone / pull the lmeh eval data
6
- TOKEN = os.environ.get("TOKEN", None)
 
 
 
 
 
7
 
8
  OWNER = "openlifescienceai"
9
  REPO_ID = f"{OWNER}/open_medical_llm_leaderboard"
10
  QUEUE_REPO = f"{OWNER}/requests"
11
  RESULTS_REPO = f"{OWNER}/results"
12
 
 
 
13
  CACHE_PATH=os.getenv("HF_HOME", ".")
14
- # print("CACHE_PATH", CACHE_PATH)
15
 
16
  # Local caches
17
  EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
18
  EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
 
 
19
 
20
  API = HfApi(token=TOKEN)
 
2
 
3
  from huggingface_hub import HfApi
4
 
5
+ # Info to change for your repository
6
+ # ----------------------------------
7
+ TOKEN = os.environ.get("TOKEN") # A read/write token for your org
8
+
9
+ DEVICE = "cpu" # "cuda:0" if you add compute
10
+ LIMIT = None # !!!! Should be None for actual evaluations!!!
11
+ # ----------------------------------
12
 
13
  OWNER = "openlifescienceai"
14
  REPO_ID = f"{OWNER}/open_medical_llm_leaderboard"
15
  QUEUE_REPO = f"{OWNER}/requests"
16
  RESULTS_REPO = f"{OWNER}/results"
17
 
18
+
19
+ # If you setup a cache later, just change HF_HOME
20
  CACHE_PATH=os.getenv("HF_HOME", ".")
 
21
 
22
  # Local caches
23
  EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
24
  EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
25
+ EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
26
+ EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
27
 
28
  API = HfApi(token=TOKEN)
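The block marked "Info to change for your repository" is the whole per-deployment configuration. A hedged sketch of how a fork running on GPU hardware might fill it in (the owner name and device string are placeholders, not values from this commit):

```python
# Hedged sketch: per-deployment values for a fork running on GPU hardware.
# The owner name and device string are placeholders.
import os
from huggingface_hub import HfApi

TOKEN = os.environ.get("TOKEN")      # read/write token for your org (e.g. a Space secret)
DEVICE = "cuda:0"                    # keep "cpu" on the free hardware tier
LIMIT = None                         # must stay None for real evaluations

OWNER = "my-org"                     # placeholder organisation
REPO_ID = f"{OWNER}/my_leaderboard"
QUEUE_REPO = f"{OWNER}/requests"
RESULTS_REPO = f"{OWNER}/results"

API = HfApi(token=TOKEN)
```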
src/leaderboard/read_evals.py CHANGED
@@ -103,7 +103,7 @@ class EvalResult:
103
  self.num_params = request.get("params", 0)
104
  self.date = request.get("submitted_time", "")
105
  except Exception:
106
- print(f"Could not find request file for {self.org}/{self.model}")
107
 
108
  def to_dict(self):
109
  """Converts the Eval Result to a dict compatible with our dataframe display"""
@@ -139,8 +139,6 @@ def get_request_file_for_model(requests_path, model_name, precision):
139
  )
140
  request_files = glob.glob(request_files)
141
 
142
- print("yahaa", request_files)
143
-
144
  # Select correct request file (precision)
145
  request_file = ""
146
  request_files = sorted(request_files, reverse=True)
 
103
  self.num_params = request.get("params", 0)
104
  self.date = request.get("submitted_time", "")
105
  except Exception:
106
+ print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
107
 
108
  def to_dict(self):
109
  """Converts the Eval Result to a dict compatible with our dataframe display"""
 
139
  )
140
  request_files = glob.glob(request_files)
141
 
 
 
142
  # Select correct request file (precision)
143
  request_file = ""
144
  request_files = sorted(request_files, reverse=True)
src/submission/check_validity.py CHANGED
@@ -8,7 +8,7 @@ import huggingface_hub
8
  from huggingface_hub import ModelCard
9
  from huggingface_hub.hf_api import ModelInfo
10
  from transformers import AutoConfig
11
- from transformers.models.auto.tokenization_auto import tokenizer_class_from_name, get_tokenizer_config
12
 
13
  def check_model_card(repo_id: str) -> tuple[bool, str]:
14
  """Checks if the model card and license exist and have been filled"""
@@ -31,32 +31,20 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
31
 
32
  return True, ""
33
 
34
-
35
  def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
36
- """Makes sure the model is on the hub, and uses a valid configuration (in the latest transformers version)"""
37
  try:
38
-
39
- print("this is input :", model_name, revision, token, trust_remote_code, test_tokenizer)
40
  config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
41
- print("getting config", config)
42
  if test_tokenizer:
43
- tokenizer_config = get_tokenizer_config(model_name)
44
- print("tokenizer_config", tokenizer_config)
45
- if tokenizer_config is not None:
46
- tokenizer_class_candidate = tokenizer_config.get("tokenizer_class", None)
47
- else:
48
- tokenizer_class_candidate = config.tokenizer_class
49
-
50
-
51
- tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
52
-
53
- if tokenizer_class is None:
54
  return (
55
  False,
56
- f"uses {tokenizer_class_candidate}, which is not in a transformers release, therefore not supported at the moment.",
57
  None
58
  )
59
- print(config)
 
60
  return True, None, config
61
 
62
  except ValueError:
@@ -67,7 +55,6 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
67
  )
68
 
69
  except Exception as e:
70
- print("exception is", e)
71
  return False, "was not found on hub!", None
72
 
73
 
@@ -107,4 +94,4 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
107
  organisation, _ = info["model"].split("/")
108
  users_to_submission_dates[organisation].append(info["submitted_time"])
109
 
110
- return set(file_names), users_to_submission_dates
 
8
  from huggingface_hub import ModelCard
9
  from huggingface_hub.hf_api import ModelInfo
10
  from transformers import AutoConfig
11
+ from transformers.models.auto.tokenization_auto import AutoTokenizer
12
 
13
  def check_model_card(repo_id: str) -> tuple[bool, str]:
14
  """Checks if the model card and license exist and have been filled"""
 
31
 
32
  return True, ""
33
 
 
34
  def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
 
35
  try:
 
 
36
  config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
 
37
  if test_tokenizer:
38
+ try:
39
+ tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
40
+ except ValueError as e:
 
41
  return (
42
  False,
43
+ f"uses a tokenizer which is not in a transformers release: {e}",
44
  None
45
  )
46
+ except Exception as e:
47
+ return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
48
  return True, None, config
49
 
50
  except ValueError:
 
55
  )
56
 
57
  except Exception as e:
 
58
  return False, "was not found on hub!", None
59
 
60
 
 
94
  organisation, _ = info["model"].split("/")
95
  users_to_submission_dates[organisation].append(info["submitted_time"])
96
 
97
+ return set(file_names), users_to_submission_dates
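The reworked check above now simply tries to load the tokenizer with `AutoTokenizer` and surfaces a readable error. A hedged usage sketch mirroring the call in src/submission/submit.py (the repo id is a placeholder):

```python
# Hedged sketch mirroring the call in src/submission/submit.py; the repo id is a placeholder.
from src.submission.check_validity import is_model_on_hub
from src.envs import TOKEN

ok, error, config = is_model_on_hub(
    model_name="my-org/my-model",   # placeholder repo id
    revision="main",
    token=TOKEN,
    test_tokenizer=True,            # also try to load the tokenizer with AutoTokenizer
)
if not ok:
    print(f'Model "my-org/my-model" {error}')
```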
src/submission/submit.py CHANGED
@@ -50,7 +50,7 @@ def add_new_eval(
50
  return styled_error(f'Base model "{base_model}" {error}')
51
 
52
  if not weight_type == "Adapter":
53
- model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, test_tokenizer=True)
54
  if not model_on_hub:
55
  return styled_error(f'Model "{model}" {error}')
56
 
@@ -87,6 +87,7 @@ def add_new_eval(
87
  "likes": model_info.likes,
88
  "params": model_size,
89
  "license": license,
 
90
  }
91
 
92
  # Check for duplicate submission
 
50
  return styled_error(f'Base model "{base_model}" {error}')
51
 
52
  if not weight_type == "Adapter":
53
+ model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
54
  if not model_on_hub:
55
  return styled_error(f'Model "{model}" {error}')
56
 
 
87
  "likes": model_info.likes,
88
  "params": model_size,
89
  "license": license,
90
+ "private": False,
91
  }
92
 
93
  # Check for duplicate submission