add data

Files changed:
- README.md +6 -2
- app.py +46 -38
- main_backend.py +78 -0
- requirements.txt +4 -1
- scripts/create_request_file.py +3 -5
- scripts/fix_harness_import.py +11 -0
- src/{display/about.py → about.py} +13 -23
- src/backend/manage_requests.py +122 -0
- src/backend/run_eval_suite.py +57 -0
- src/backend/sort_queue.py +28 -0
- src/display/formatting.py +0 -9
- src/display/utils.py +13 -10
- src/envs.py +11 -3
- src/leaderboard/read_evals.py +1 -3
- src/submission/check_validity.py +8 -21
- src/submission/submit.py +2 -1
README.md
CHANGED

@@ -12,7 +12,7 @@ license: apache-2.0
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
-Most of the variables to change for a default leaderboard are in env (replace the path for your leaderboard) and src/
+Most of the variables to change for a default leaderboard are in src/env (replace the path for your leaderboard) and src/about.
 
 Results files should have the following format:
 ```

@@ -33,4 +33,8 @@ Results files should have the following format:
 }
 ```
 
-Request files are created automatically by this tool.
+Request files are created automatically by this tool.
+
+If you encounter problems on the space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
+
+If you want to run your own backend, you only need to change the logic in src/backend/run_eval_suite, which at the moment launches the Eleuther AI Harness.
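The backend swap mentioned in the README only needs to preserve the `run_evaluation` interface that `main_backend.py` calls (defined in `src/backend/run_eval_suite.py` later in this commit). A minimal sketch of such a replacement is shown below; the placeholder scores and results layout are assumptions for illustration, not the leaderboard's actual results schema, and the hub upload done by the real function is omitted.

```python
# Hypothetical drop-in for src/backend/run_eval_suite.run_evaluation.
# Same signature as the harness-based version added in this commit; the
# "results" payload and scores below are placeholders, not the real schema.
import json
import os
from datetime import datetime


def run_evaluation(eval_request, task_names, num_fewshot, batch_size, device,
                   local_dir: str, results_repo: str, no_cache=True, limit=None):
    results = {
        "config": {
            "model_name": eval_request.model,
            "model_sha": eval_request.revision,
            "model_dtype": eval_request.precision,
        },
        # Replace this with your own evaluation loop.
        "results": {task: {"acc,none": 0.0} for task in task_names},
    }

    output_path = os.path.join(local_dir, *eval_request.model.split("/"),
                               f"results_{datetime.now()}.json")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w") as f:
        json.dump(results, f, indent=2)
    return results
```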
app.py
CHANGED

@@ -1,12 +1,39 @@
+import subprocess
 import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 
-
+from src.about import (
+    CITATION_BUTTON_LABEL,
+    CITATION_BUTTON_TEXT,
+    EVALUATION_QUEUE_TEXT,
+    INTRODUCTION_TEXT,
+    LLM_BENCHMARKS_TEXT,
+    TITLE,
+)
+from src.display.css_html_js import custom_css
+from src.display.utils import (
+    BENCHMARK_COLS,
+    COLS,
+    EVAL_COLS,
+    EVAL_TYPES,
+    NUMERIC_INTERVALS,
+    TYPES,
+    AutoEvalColumn,
+    ModelType,
+    fields,
+    WeightType,
+    Precision
+)
+from src.envs import API, DEVICE, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.submission.submit import add_new_eval
+
+import os
 
 # # Define the folders to delete
-# folders_to_delete = ['eval-results', 'eval-queue']
+# folders_to_delete = ['eval-results', 'eval-queue', 'eval-queue-bk', 'eval-results-bk']
 
 # import shutil
 

@@ -29,61 +56,41 @@ from huggingface_hub import snapshot_download
 # # Find and kill processes running on port 7878
 # try:
 # # Find process using port 7878
-# output = subprocess.check_output(["lsof", "-ti", "tcp:
+# output = subprocess.check_output(["lsof", "-ti", "tcp:7862"]).decode().strip()
 # if output:
 # # Split the output in case there are multiple PIDs
 # pids = output.split('\n')
 # for pid in pids:
 # # Kill each process
 # os.kill(int(pid), signal.SIGKILL)
-# result = "Processes running on port
+# result = "Processes running on port 7862 have been killed."
 # else:
-# result = "No processes are running on port
+# result = "No processes are running on port 7862."
 # except Exception as e:
 # result = f"An error occurred: {str(e)}"
 
 
-
-
-    CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
-    INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
-    TITLE,
-)
-from src.display.css_html_js import custom_css
-from src.display.utils import (
-    BENCHMARK_COLS,
-    COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    NUMERIC_INTERVALS,
-    TYPES,
-    AutoEvalColumn,
-    ModelType,
-    fields,
-    WeightType,
-    Precision
-)
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, TOKEN, QUEUE_REPO, REPO_ID, RESULTS_REPO
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
+subprocess.run(["python3", "scripts/fix_harness_import.py"])
+
 
 
 def restart_space():
-    API.restart_space(repo_id=REPO_ID
+    API.restart_space(repo_id=REPO_ID)
+
+def launch_backend():
+    _ = subprocess.run(["python3", "main_backend.py"])
 
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
    )
 except Exception:
    restart_space()
 try:
    print(EVAL_RESULTS_PATH)
    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
    )
 except Exception:
    restart_space()

@@ -154,12 +161,12 @@ def filter_models(
     df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
 ) -> pd.DataFrame:
     # Show all models
-    filtered_df = df
     # if show_deleted:
     # filtered_df = df
     # else: # Show only still on the hub models
     # filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
 
+    filtered_df = df
     type_emoji = [t[0] for t in type_query]
     filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
     filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]

@@ -344,7 +351,7 @@ with demo:
     choices=[i.value.name for i in Precision if i != Precision.Unknown],
     label="Precision",
     multiselect=False,
-    value="float16",
+    value="float16" if DEVICE != "cpu" else "float32",
     interactive=True,
 )
 weight_type = gr.Dropdown(

@@ -383,11 +390,12 @@ with demo:
 
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
+scheduler.add_job(launch_backend, "interval", seconds=100) # will only allow one job to be run at the same time
 scheduler.start()
 demo.queue(default_concurrency_limit=40).launch()
 
-
 # scheduler = BackgroundScheduler()
-# scheduler.add_job(restart_space, "interval", seconds=
+# scheduler.add_job(restart_space, "interval", seconds=1800)
+# scheduler.add_job(launch_backend, "interval", seconds=100) # will only allow one job to be run at the same time
 # scheduler.start()
-# demo.queue().launch()
+# demo.queue().launch()
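A note on the new `scheduler.add_job(launch_backend, "interval", seconds=100)` line: the "only one job at a time" behaviour the comment relies on comes from APScheduler's per-job `max_instances` setting, which defaults to 1, so a tick that fires while a previous backend run is still in progress is skipped rather than stacked. The equivalent setup with that default made explicit:

```python
# Equivalent scheduling setup with APScheduler's default made explicit.
import subprocess
from apscheduler.schedulers.background import BackgroundScheduler

def launch_backend():
    # as in app.py: run one pass of the backend in a subprocess
    subprocess.run(["python3", "main_backend.py"])

scheduler = BackgroundScheduler()
# max_instances=1 is the default: overlapping runs are skipped, not queued up.
scheduler.add_job(launch_backend, "interval", seconds=100, max_instances=1)
scheduler.start()
```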
main_backend.py
ADDED

@@ -0,0 +1,78 @@
import logging
import pprint

from huggingface_hub import snapshot_download

logging.getLogger("openai").setLevel(logging.WARNING)

from src.backend.run_eval_suite import run_evaluation
from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
from src.backend.sort_queue import sort_models_by_priority

from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, DEVICE, API, LIMIT, TOKEN
from src.about import Tasks, NUM_FEWSHOT
TASKS_HARNESS = [task.value.benchmark for task in Tasks]

logging.basicConfig(level=logging.ERROR)
pp = pprint.PrettyPrinter(width=80)

PENDING_STATUS = "PENDING"
RUNNING_STATUS = "RUNNING"
FINISHED_STATUS = "FINISHED"
FAILED_STATUS = "FAILED"

snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)

def run_auto_eval():
    current_pending_status = [PENDING_STATUS]

    # pull the eval dataset from the hub and parse any eval requests
    # check completed evals and set them to finished
    check_completed_evals(
        api=API,
        checked_status=RUNNING_STATUS,
        completed_status=FINISHED_STATUS,
        failed_status=FAILED_STATUS,
        hf_repo=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
        hf_repo_results=RESULTS_REPO,
        local_dir_results=EVAL_RESULTS_PATH_BACKEND
    )

    # Get all eval request that are PENDING, if you want to run other evals, change this parameter
    eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
    # Sort the evals by priority (first submitted first run)
    eval_requests = sort_models_by_priority(api=API, models=eval_requests)

    print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")

    if len(eval_requests) == 0:
        return

    eval_request = eval_requests[0]
    pp.pprint(eval_request)

    set_eval_request(
        api=API,
        eval_request=eval_request,
        set_to_status=RUNNING_STATUS,
        hf_repo=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
    )

    run_evaluation(
        eval_request=eval_request,
        task_names=TASKS_HARNESS,
        num_fewshot=NUM_FEWSHOT,
        local_dir=EVAL_RESULTS_PATH_BACKEND,
        results_repo=RESULTS_REPO,
        batch_size=1,
        device=DEVICE,
        no_cache=True,
        limit=LIMIT
    )


if __name__ == "__main__":
    run_auto_eval()
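`run_auto_eval()` drives a small state machine over the request files in the queue dataset: PENDING requests are picked oldest-first, flipped to RUNNING, and later marked FINISHED or FAILED depending on whether a results file appears. For reference, a request file the backend could pick up might look like the following; the field names come from the `EvalRequest` dataclass added in `src/backend/manage_requests.py` below, but the exact schema written by the submission form is not fully shown in this commit, so treat it as an inferred example (the model id and file path are hypothetical):

```python
# Inferred example of a PENDING request file; any *.json under the queue
# folder is found by get_eval_requests' recursive glob. Field names match the
# EvalRequest dataclass; values and the path are made up for illustration.
import json
import os

request = {
    "model": "my-org/my-model",
    "private": False,
    "status": "PENDING",
    "weight_type": "Original",
    "precision": "float16",
    "revision": "main",
    "submitted_time": "2024-01-01T00:00:00Z",
    "likes": 0,
    "params": 7,
    "license": "apache-2.0",
}

os.makedirs("eval-queue-bk/my-org", exist_ok=True)
with open("eval-queue-bk/my-org/my-model.json", "w") as f:
    json.dump(request, f, indent=2)
```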
requirements.txt
CHANGED

@@ -12,4 +12,7 @@ python-dateutil==2.8.2
 requests==2.28.2
 tqdm==4.65.0
 transformers
-tokenizers>=0.15.0
+tokenizers>=0.15.0
+git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
+accelerate==0.24.1
+sentencepiece
scripts/create_request_file.py
CHANGED

@@ -7,11 +7,9 @@ from datetime import datetime, timezone
 import click
 from colorama import Fore
 from huggingface_hub import HfApi, snapshot_download
+from src.envs import TOKEN, EVAL_REQUESTS_PATH, QUEUE_REPO
 
-
-QUEUE_REPO = "open-llm-leaderboard/requests"
-
-precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ")
+precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ", "float32")
 model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
 weight_types = ("Original", "Delta", "Adapter")
 

@@ -36,7 +34,7 @@ def get_model_size(model_info, precision: str):
 def main():
     api = HfApi()
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-    snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset")
+    snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN)
 
     model_name = click.prompt("Enter model name")
     revision = click.prompt("Enter revision", default="main")
scripts/fix_harness_import.py
ADDED

@@ -0,0 +1,11 @@
"""This file should be used after pip install -r requirements.
It creates a folder not ported during harness package creation (as they don't use a Manifest file atm and it ignore `.json` files).
It will need to be updated if we want to use the harness' version of big bench to actually copy the json files.
"""
import os

import lm_eval

if __name__ == "__main__":
    lm_eval_path = lm_eval.__path__[0]
    os.makedirs(os.path.join(lm_eval_path, "datasets", "bigbench_resources"), exist_ok=True)
src/{display/about.py → about.py}
RENAMED

@@ -1,7 +1,5 @@
 from dataclasses import dataclass
 from enum import Enum
-# from src.display.utils import ModelType
-
 
 @dataclass
 class Task:

@@ -10,7 +8,8 @@ class Task:
     col_name: str
 
 
-#
+# Select your tasks here
+# ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("medmcqa", "acc,none", "MedMCQA")

@@ -23,18 +22,20 @@ class Tasks(Enum):
     task7 = Task("mmlu_professional_medicine", "acc,none", "MMLU Professional Medicine")
     task8 = Task("pubmedqa", "acc,none", "PubMedQA")
 
+
 
-# "medmcqa", "acc,none", "MedMCQA"
 
-#
-#
+NUM_FEWSHOT = 0 # Change with your few shot
+# ---------------------------------------------------
+
+
+
 TITLE = """
 
 
 <div style="text-align: center; margin-bottom: 20px;">
 <img src="https://raw.githubusercontent.com/monk1337/MultiMedQA/main/assets/logs.png" alt="Descriptive Alt Text" style="display: block; margin: auto; height: 160px;">
 </div>
-
 <h1 align="center" style="color: #1a237e; font-size: 40px;">Open <span style="color: #990001;">Medical-LLM</span> Leaderboard</h1>
 
 

@@ -44,15 +45,17 @@ TITLE = """
 INTRODUCTION_TEXT = """
 🩺 The Open Medical LLM Leaderboard aims to track, rank and evaluate the performance of large language models (LLMs) on medical question answering tasks. It evaluates LLMs across a diverse array of medical datasets, including MedQA (USMLE), PubMedQA, MedMCQA, and subsets of MMLU related to medicine and biology. The leaderboard offers a comprehensive assessment of each model's medical knowledge and question answering capabilities.
 
+
 The datasets cover various aspects of medicine such as general medical knowledge, clinical knowledge, anatomy, genetics, and more. They contain multiple-choice and open-ended questions that require medical reasoning and understanding. More details on the datasets can be found in the "LLM Benchmarks Details" section below.
 
+
 The main evaluation metric used is Accuracy (ACC). Submit a model for automated evaluation on the "Submit" page. If you have comments or suggestions on additional medical datasets to include, please reach out to us in our discussion forum.
 
+
 The backend of the Open Medical LLM Leaderboard uses the Eleuther AI Language Model Evaluation Harness. More technical details can be found in the "About" page.
 """
 
 LLM_BENCHMARKS_TEXT = f"""
-
 <h2 style="color: #2c3e50;"> Why Leaderboard? </h2>
 
 Evaluating the medical knowledge and clinical reasoning capabilities of LLMs is crucial as they are increasingly being applied to healthcare and biomedical applications. The Open Medical LLM Leaderboard provides a platform to assess the latest LLMs on their performance on a variety of medical question answering tasks. This can help identify the strengths and gaps in medical understanding of current models.

@@ -64,18 +67,15 @@ Evaluating the medical knowledge and clinical reasoning capabilities of LLMs is
 <h2 style="color: #2c3e50;">About Open Life Science AI</h2>
 An Open Life Science Project to Benchmark and Track AI Progress, Share Models and Datasets in the Life Science Field.
 <a href="https://openlifescience.ai/" target="_blank"> More info </a>
-
-
 <h2 style="color: #2c3e50;">Datasets</h2>
 
 <div style="font-family: Arial, sans-serif; line-height: 1.6; color: #333;"> <ul style="list-style-type: none; padding: 0;"> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.13081" target="_blank" style="color: #3498db;">MedQA (USMLE)</a></h3> <p>1273 real-world questions from the US Medical License Exams (USMLE) to test general medical knowledge</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/1909.06146" target="_blank" style="color: #3498db;">PubMedQA</a></h3> <p>500 questions constructed from PubMed article titles along with the abstracts as context to test understanding of biomedical research</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://proceedings.mlr.press/v174/pal22a.html" target="_blank" style="color: #3498db;">MedMCQA</a></h3> <p>4183 questions from Indian medical entrance exams (AIIMS & NEET PG) spanning 2.4k healthcare topics</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-Clinical knowledge</a></h3> <p>265 multiple choice questions on clinical knowledge</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-Medical genetics</a></h3> <p>100 MCQs on medical genetics</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-Anatomy</a></h3> <p>135 anatomy MCQs</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-Professional medicine</a></h3> <p>272 MCQs on professional medicine</p> </li> <li style="margin-bottom: 20px;"> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-College biology</a></h3> <p>144 MCQs on college-level biology</p> </li> <li> <h3 style="color: #2c3e50; margin-bottom: 5px;"><a href="https://arxiv.org/abs/2009.03300" target="_blank" style="color: #3498db;">MMLU-College medicine</a></h3> <p>173 college medicine MCQs</p> </li> </ul> </div>
 
-<div style="font-family: Arial, sans-serif; line-height: 1.6; color: #333;"> <h2 style="color: #2c3e50;">Evaluation Metric</h2> <p>Metric Accuracy (ACC) is used as the main evaluation metric across all datasets.</p> <h2 style="color: #2c3e50;">Details and Logs</h2> <p>Detailed results are available in the results directory:</p> <a href="https://huggingface.co/datasets/openlifescienceai/results" target="_blank" style="color: #3498db;">https://huggingface.co/datasets/openlifescienceai/results</a> <p>Input/outputs for each model can be found in the details page accessible by clicking the 📄 emoji next to the model name.</p> <h2 style="color: #2c3e50;">Reproducibility</h2> <p>To reproduce the results, you can run this evaluation script:</p> <pre style="background-color: #f0f0f0; padding: 10px; border-radius: 5px;">python eval_medical_llm.py</pre> <p>To evaluate a specific dataset on a model, use the EleutherAI LLM Evaluation Harness:</p> <pre style="background-color: #f0f0f0; padding: 10px; border-radius: 5px;">python main.py --model=hf-auto --model_args="pretrained=<model>,revision=<revision>,parallelize=True" --tasks=<dataset> --num_fewshot=<n_shots> --batch_size=1 --output_path=<output_dir></pre> <p>Note some datasets may require additional setup, refer to the Evaluation Harness documentation.</p> <p>Adjust batch size based on your GPU memory if not using parallelism. Minor variations in results are expected with different batch sizes due to padding.</p> <h2 style="color: #2c3e50;">Icons</h2> <ul style="list-style-type: none; padding: 0;"> <li>🟢 Pre-trained model</li> <li>🔶 Fine-tuned model</li> <li>? Unknown model type</li> <li>⭕ Instruction-tuned</li> <li>🟦 RL-tuned</li> </ul> <p>Missing icons indicate the model info is not yet added, feel free to open an issue to include it!</p> </div>
 
+<div style="font-family: Arial, sans-serif; line-height: 1.6; color: #333;"> <h2 style="color: #2c3e50;">Evaluation Metric</h2> <p>Metric Accuracy (ACC) is used as the main evaluation metric across all datasets.</p> <h2 style="color: #2c3e50;">Details and Logs</h2> <p>Detailed results are available in the results directory:</p> <a href="https://huggingface.co/datasets/openlifescienceai/results" target="_blank" style="color: #3498db;">https://huggingface.co/datasets/openlifescienceai/results</a> <p>Input/outputs for each model can be found in the details page accessible by clicking the 📄 emoji next to the model name.</p> <h2 style="color: #2c3e50;">Reproducibility</h2> <p>To reproduce the results, you can run this evaluation script:</p> <pre style="background-color: #f0f0f0; padding: 10px; border-radius: 5px;">python eval_medical_llm.py</pre> <p>To evaluate a specific dataset on a model, use the EleutherAI LLM Evaluation Harness:</p> <pre style="background-color: #f0f0f0; padding: 10px; border-radius: 5px;">python main.py --model=hf-auto --model_args="pretrained=<model>,revision=<revision>,parallelize=True" --tasks=<dataset> --num_fewshot=<n_shots> --batch_size=1 --output_path=<output_dir></pre> <p>Note some datasets may require additional setup, refer to the Evaluation Harness documentation.</p> <p>Adjust batch size based on your GPU memory if not using parallelism. Minor variations in results are expected with different batch sizes due to padding.</p> <h2 style="color: #2c3e50;">Icons</h2> <ul style="list-style-type: none; padding: 0;"> <li>🟢 Pre-trained model</li> <li>🔶 Fine-tuned model</li> <li>? Unknown model type</li> <li>⭕ Instruction-tuned</li> <li>🟦 RL-tuned</li> </ul> <p>Missing icons indicate the model info is not yet added, feel free to open an issue to include it!</p> </div>
 """
 
 LLM_BENCHMARKS_DETAILS = f"""
-
 Datasets
 <a href="https://arxiv.org/abs/2009.13081" target="_blank">MedQA (USMLE)</a> - 1273 real-world questions from the US Medical License Exams (USMLE) to test general medical knowledge
 <a href="https://arxiv.org/abs/1909.06146" target="_blank">PubMedQA</a> - 500 questions constructed from PubMed article titles along with the abstracts as context to test understanding of biomedical research

@@ -93,14 +93,10 @@ Detailed results are available in the results directory: https://huggingface.co/
 Input/outputs for each model can be found in the details page accessible by clicking the 📄 emoji next to the model name
 Reproducibility
 To reproduce the results, you can run this evaluation script: python eval_medical_llm.py.
-
 To evaluate a specific dataset on a model, use the EleutherAI LLM Evaluation Harness:
-
 python main.py --model=hf-auto --model_args="pretrained=<model>,revision=<revision>,parallelize=True"
 --tasks=<dataset> --num_fewshot=<n_shots> --batch_size=1 --output_path=<output_dir>
-
 Note some datasets may require additional setup, refer to the Evaluation Harness documentation. Adjust batch size based on your GPU memory if not using parallelism. Minor variations in results are expected with different batch sizes due to padding.
-
 Icons
 🟢 Pre-trained model
 🔶 Fine-tuned model

@@ -114,28 +110,24 @@ FAQ_TEXT = """
 FAQ
 1) Submitting a model
 XXX
-
 2) Model results
 XXX
-
 3) Editing a submission
 XXX
 """
 
 EVALUATION_QUEUE_TEXT = """
-
 Evaluation Queue for the Open Medical LLM Leaderboard
 Models added here will be automatically evaluated.
 
 Before submitting a model
 1) Verify loading with AutoClasses:
-python
 
 
-Copy code
 from transformers import AutoConfig, AutoModel, AutoTokenizer
 config = AutoConfig.from_pretrained("model-name", revision=revision)
 model = AutoModel.from_pretrained("model-name", revision=revision)
+
 tokenizer = AutoTokenizer.from_pretrained("model-name", revision=revision)
 Debug any loading errors before submission. Make sure the model is public.
 

@@ -162,7 +154,6 @@ year = {2024},
 publisher = {Hugging Face},
 howpublished = "\url{https://huggingface.co/spaces/openlifescienceai/open_medical_llm_leaderboard}"
 }
-
 @misc{singhal2022large,
 title={Large Language Models Encode Clinical Knowledge},
 author={Karan Singhal et al.},

@@ -171,5 +162,4 @@ howpublished = "\url{https://huggingface.co/spaces/openlifescienceai/open_medica
 archivePrefix={arXiv},
 primaryClass={cs.CL}
 }
-
 """
src/backend/manage_requests.py
ADDED

@@ -0,0 +1,122 @@
import glob
import json
from dataclasses import dataclass
from typing import Optional

from huggingface_hub import HfApi, snapshot_download
from src.envs import TOKEN

@dataclass
class EvalRequest:
    model: str
    private: bool
    status: str
    json_filepath: str
    weight_type: str = "Original"
    model_type: str = ""  # pretrained, finetuned, with RL
    precision: str = ""  # float16, bfloat16
    base_model: Optional[str] = None  # for adapter models
    revision: str = "main"  # commit
    submitted_time: Optional[str] = "2022-05-18T11:40:22.519222"  # random date just so that we can still order requests by date
    model_type: Optional[str] = None
    likes: Optional[int] = 0
    params: Optional[int] = None
    license: Optional[str] = ""

    def get_model_args(self):
        model_args = f"pretrained={self.model},revision={self.revision}"

        if self.precision in ["float16", "bfloat16", "float32"]:
            model_args += f",dtype={self.precision}"
        # Quantized models need some added config, the install of bits and bytes, etc
        #elif self.precision == "8bit":
        #    model_args += ",load_in_8bit=True"
        #elif self.precision == "4bit":
        #    model_args += ",load_in_4bit=True"
        #elif self.precision == "GPTQ":
            # A GPTQ model does not need dtype to be specified,
            # it will be inferred from the config
            pass
        else:
            raise Exception(f"Unknown precision {self.precision}.")

        return model_args


def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str, hf_repo: str, local_dir: str):
    """Updates a given eval request with its new status on the hub (running, completed, failed, ...)"""
    json_filepath = eval_request.json_filepath

    with open(json_filepath) as fp:
        data = json.load(fp)

    data["status"] = set_to_status

    with open(json_filepath, "w") as f:
        f.write(json.dumps(data))

    api.upload_file(
        path_or_fileobj=json_filepath,
        path_in_repo=json_filepath.replace(local_dir, ""),
        repo_id=hf_repo,
        repo_type="dataset",
    )


def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
    """Get all pending evaluation requests and return a list in which private
    models appearing first, followed by public models sorted by the number of
    likes.

    Returns:
        `list[EvalRequest]`: a list of model info dicts.
    """
    snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60, token=TOKEN)
    json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)

    eval_requests = []
    for json_filepath in json_files:
        with open(json_filepath) as fp:
            data = json.load(fp)
        if data["status"] in job_status:
            data["json_filepath"] = json_filepath
            eval_request = EvalRequest(**data)
            eval_requests.append(eval_request)

    return eval_requests


def check_completed_evals(
    api: HfApi,
    hf_repo: str,
    local_dir: str,
    checked_status: str,
    completed_status: str,
    failed_status: str,
    hf_repo_results: str,
    local_dir_results: str,
):
    """Checks if the currently running evals are completed, if yes, update their status on the hub."""
    snapshot_download(repo_id=hf_repo_results, revision="main", local_dir=local_dir_results, repo_type="dataset", max_workers=60, token=TOKEN)

    running_evals = get_eval_requests(checked_status, hf_repo=hf_repo, local_dir=local_dir)

    for eval_request in running_evals:
        model = eval_request.model
        print("====================================")
        print(f"Checking {model}")

        output_path = model
        output_file = f"{local_dir_results}/{output_path}/results*.json"
        output_file_exists = len(glob.glob(output_file)) > 0

        if output_file_exists:
            print(
                f"EXISTS output file exists for {model} setting it to {completed_status}"
            )
            set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
        else:
            print(
                f"No result file found for {model} setting it to {failed_status}"
            )
            set_eval_request(api, eval_request, failed_status, hf_repo, local_dir)
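`get_model_args()` is what ties a request to the harness: it turns the request fields into the `model_args` string passed to `lm_eval`. A quick check with a hypothetical model id:

```python
# Builds the lm-eval model_args string from a request (model id is hypothetical).
from src.backend.manage_requests import EvalRequest

req = EvalRequest(
    model="my-org/my-model",
    private=False,
    status="PENDING",
    json_filepath="",   # not used by get_model_args
    precision="float16",
)
print(req.get_model_args())
# pretrained=my-org/my-model,revision=main,dtype=float16
```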
src/backend/run_eval_suite.py
ADDED

@@ -0,0 +1,57 @@
import json
import os
import logging
from datetime import datetime

from lm_eval import tasks, evaluator, utils

from src.envs import RESULTS_REPO, API
from src.backend.manage_requests import EvalRequest

logging.getLogger("openai").setLevel(logging.WARNING)

def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, local_dir: str, results_repo: str, no_cache=True, limit=None):
    if limit:
        print(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    task_names = utils.pattern_match(task_names, tasks.ALL_TASKS)

    print(f"Selected Tasks: {task_names}")

    results = evaluator.simple_evaluate(
        model="hf-causal-experimental",  # "hf-causal"
        model_args=eval_request.get_model_args(),
        tasks=task_names,
        num_fewshot=num_fewshot,
        batch_size=batch_size,
        device=device,
        no_cache=no_cache,
        limit=limit,
        write_out=True,
        output_base_path="logs"
    )

    results["config"]["model_dtype"] = eval_request.precision
    results["config"]["model_name"] = eval_request.model
    results["config"]["model_sha"] = eval_request.revision

    dumped = json.dumps(results, indent=2)
    print(dumped)

    output_path = os.path.join(local_dir, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w") as f:
        f.write(dumped)

    print(evaluator.make_table(results))

    API.upload_file(
        path_or_fileobj=output_path,
        path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
        repo_id=results_repo,
        repo_type="dataset",
    )

    return results
src/backend/sort_queue.py
ADDED

@@ -0,0 +1,28 @@
import re
from dataclasses import dataclass

from huggingface_hub import HfApi

from src.backend.manage_requests import EvalRequest


@dataclass
class ModelMetadata:
    likes: int = 0
    size: int = 15


def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
    private_models = [model for model in models if model.private]
    public_models = [model for model in models if not model.private]

    return sort_by_submit_date(private_models) + sort_by_submit_date(public_models)

def sort_by_submit_date(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
    return sorted(eval_requests, key=lambda x: x.submitted_time, reverse=False)

def sort_by_size(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
    return sorted(eval_requests, key=lambda x: x.params, reverse=False)

def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
    return sorted(eval_requests, key=lambda x: x.likes, reverse=False)
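The ordering used by `main_backend.py` is therefore: private requests first, then public ones, each group oldest submission first. A small illustration with hypothetical requests:

```python
# Private requests come first, then public, each sorted by submitted_time.
from src.backend.manage_requests import EvalRequest
from src.backend.sort_queue import sort_models_by_priority

queue = [
    EvalRequest(model="org/public-new", private=False, status="PENDING", json_filepath="", submitted_time="2024-02-01T00:00:00Z"),
    EvalRequest(model="org/private-old", private=True, status="PENDING", json_filepath="", submitted_time="2024-01-01T00:00:00Z"),
    EvalRequest(model="org/public-old", private=False, status="PENDING", json_filepath="", submitted_time="2024-01-15T00:00:00Z"),
]

ordered = sort_models_by_priority(api=None, models=queue)  # the api argument is not used by the sort itself
print([r.model for r in ordered])
# ['org/private-old', 'org/public-old', 'org/public-new']
```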
src/display/formatting.py
CHANGED

@@ -1,12 +1,3 @@
-import os
-from datetime import datetime, timezone
-
-from huggingface_hub import HfApi
-from huggingface_hub.hf_api import ModelInfo
-
-
-API = HfApi()
-
 def model_hyperlink(link, model_name):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
src/display/utils.py
CHANGED

@@ -3,7 +3,7 @@ from enum import Enum
 
 import pandas as pd
 
-from src.
+from src.about import Tasks
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

@@ -94,9 +94,10 @@ class WeightType(Enum):
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
-
-
-
+    float32 = ModelDetails("float32")
+    #qt_8bit = ModelDetails("8bit")
+    #qt_4bit = ModelDetails("4bit")
+    #qt_GPTQ = ModelDetails("GPTQ")
     Unknown = ModelDetails("?")
 
     def from_str(precision):

@@ -104,12 +105,14 @@ class Precision(Enum):
         return Precision.float16
     if precision in ["torch.bfloat16", "bfloat16"]:
         return Precision.bfloat16
-    if precision in ["
-        return Precision.
-    if precision in ["
-
-    if precision in ["
-
+    if precision in ["float32"]:
+        return Precision.float32
+    #if precision in ["8bit"]:
+    #    return Precision.qt_8bit
+    #if precision in ["4bit"]:
+    #    return Precision.qt_4bit
+    #if precision in ["GPTQ", "None"]:
+    #    return Precision.qt_GPTQ
     return Precision.Unknown
 
 # Column selection
src/envs.py
CHANGED

@@ -2,19 +2,27 @@ import os
 
 from huggingface_hub import HfApi
 
-#
-
+# Info to change for your repository
+# ----------------------------------
+TOKEN = os.environ.get("TOKEN") # A read/write token for your org
+
+DEVICE = "cpu" # "cuda:0" if you add compute
+LIMIT = None # !!!! Should be None for actual evaluations!!!
+# ----------------------------------
 
 OWNER = "openlifescienceai"
 REPO_ID = f"{OWNER}/open_medical_llm_leaderboard"
 QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
+
+# If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
-# print("CACHE_PATH", CACHE_PATH)
 
 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
+EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
 API = HfApi(token=TOKEN)
src/leaderboard/read_evals.py
CHANGED

@@ -103,7 +103,7 @@ class EvalResult:
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
         except Exception:
-            print(f"Could not find request file for {self.org}/{self.model}")
+            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""

@@ -139,8 +139,6 @@ def get_request_file_for_model(requests_path, model_name, precision):
     )
     request_files = glob.glob(request_files)
 
-    print("yahaa", request_files)
-
     # Select correct request file (precision)
     request_file = ""
     request_files = sorted(request_files, reverse=True)
src/submission/check_validity.py
CHANGED

@@ -8,7 +8,7 @@ import huggingface_hub
 from huggingface_hub import ModelCard
 from huggingface_hub.hf_api import ModelInfo
 from transformers import AutoConfig
-from transformers.models.auto.tokenization_auto import
+from transformers.models.auto.tokenization_auto import AutoTokenizer
 
 def check_model_card(repo_id: str) -> tuple[bool, str]:
     """Checks if the model card and license exist and have been filled"""

@@ -31,32 +31,20 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
 
     return True, ""
 
-
 def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
-    """Makes sure the model is on the hub, and uses a valid configuration (in the latest transformers version)"""
     try:
-
-        print("this is input :", model_name, revision, token, trust_remote_code, test_tokenizer)
         config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
-        print("getting config", config)
         if test_tokenizer:
-
-
-
-            tokenizer_class_candidate = tokenizer_config.get("tokenizer_class", None)
-        else:
-            tokenizer_class_candidate = config.tokenizer_class
-
-
-        tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
-
-        if tokenizer_class is None:
+            try:
+                tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+            except ValueError as e:
                 return (
                     False,
-                    f"uses
+                    f"uses a tokenizer which is not in a transformers release: {e}",
                     None
                 )
-
+            except Exception as e:
+                return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
         return True, None, config
 
     except ValueError:

@@ -67,7 +55,6 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
         )
 
     except Exception as e:
-        print("exception is", e)
         return False, "was not found on hub!", None
 
 

@@ -107,4 +94,4 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
         organisation, _ = info["model"].split("/")
         users_to_submission_dates[organisation].append(info["submitted_time"])
 
-    return set(file_names), users_to_submission_dates
+    return set(file_names), users_to_submission_dates
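With the rework above, `is_model_on_hub` now tries to actually load the tokenizer via `AutoTokenizer` when `test_tokenizer=True`, instead of only resolving the tokenizer class name, so broken tokenizer configs are rejected at submission time with a clearer message. A usage sketch (the model id is hypothetical and a TOKEN environment variable is assumed):

```python
# Usage sketch: returns (ok, error_message, config); with test_tokenizer=True
# the tokenizer is loaded too, so misconfigured tokenizers fail here already.
import os
from src.submission.check_validity import is_model_on_hub

ok, error, config = is_model_on_hub(
    model_name="my-org/my-model",
    revision="main",
    token=os.environ.get("TOKEN"),
    test_tokenizer=True,
)
if not ok:
    print(f'Model "my-org/my-model" {error}')
```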
src/submission/submit.py
CHANGED

@@ -50,7 +50,7 @@ def add_new_eval(
         return styled_error(f'Base model "{base_model}" {error}')
 
     if not weight_type == "Adapter":
-        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, test_tokenizer=True)
+        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
         if not model_on_hub:
             return styled_error(f'Model "{model}" {error}')
 

@@ -87,6 +87,7 @@
         "likes": model_info.likes,
         "params": model_size,
         "license": license,
+        "private": False,
     }
 
     # Check for duplicate submission