MotzWanted committed on
Commit 0259587
1 parent: a87427d

Merge branch 'main' of https://huggingface.co/spaces/openlifescienceai/open_medical_llm_leaderboard

app_empty.py CHANGED
@@ -4,4 +4,5 @@ def greet(name):
     return "Hello " + name + "!!"
 
 # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-# iface.launch()
+# iface.launch()
+# autocomplete
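
For context, the commented-out lines in app_empty.py reference Gradio's `Interface` API. A minimal runnable sketch of what the uncommented version would look like (assuming `gradio` is installed; this is not part of the commit itself):

```python
import gradio as gr

def greet(name):
    return "Hello " + name + "!!"

# Wire greet into a simple text-in/text-out demo and serve it locally.
iface = gr.Interface(fn=greet, inputs="text", outputs="text")
iface.launch()
```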
src/backend/envs.py CHANGED
@@ -16,27 +16,31 @@ class Task:
     num_fewshot: int
 
 
-# how are these differentiated with Tasks in display/utils.py ?
 class Tasks(Enum):
-    # task0 = Task("pubmedqa", "acc", "PubMedQA", 0)  # 64, as in the ATLAS paper
-    # task1 = Task("hellaswag", "acc_norm", "HellaSwag", 0)  # 64, as in the ATLAS paper
-    # task0 = Task("medqa", "acc_norm", "MedQA", 0)  # medqa_4options?
-    # task0 = Task("medmcqa", "acc_norm", "MedMCQA", 0)
-    # task1 = Task("pubmedqa", "acc", "PubMedQA", 0)
-
+
     task0 = Task("medmcqa", "MedMCQA", 0)
-    task1 = Task("pubmedqa", "PubMedQA", 0)
-    task2 = Task("pubmedqa_no_context", "PubMedQA_no_context", 0)
-    task3 = Task("biolama_umls", "BioLAMA-UMLS", 0)
+    task1 = Task("medqa_4options", "MedQA", 0)
+
+    task2 = Task("anatomy (mmlu)", "MMLU Anatomy", 0)
+    task3 = Task("clinical_knowledge (mmlu)", "MMLU Clinical Knowledge", 0)
+    task4 = Task("college_biology (mmlu)", "MMLU College Biology", 0)
+    task5 = Task("college_medicine (mmlu)", "MMLU College Medicine", 0)
+    task6 = Task("medical_genetics (mmlu)", "MMLU Medical Genetics", 0)
+    task7 = Task("professional_medicine (mmlu)", "MMLU Professional Medicine", 0)
+    task8 = Task("pubmedqa", "PubMedQA", 0)
 
 
 num_fewshots = {
-    "medqa": 0,
     "medmcqa": 0,
-    "pubmedqa": 0,
-    "pubmedqa_no_context": 0,
-    "biolama_umls": 0,
+    "medqa_4options": 0,
+    "anatomy (mmlu)": 0,
+    "clinical_knowledge (mmlu)": 0,
+    "college_biology (mmlu)": 0,
+    "college_medicine (mmlu)": 0,
+    "medical_genetics (mmlu)": 0,
+    "professional_medicine (mmlu)": 0,
+    "pubmedqa": 0,
 }
 
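
To make the shape of these definitions concrete, here is a minimal self-contained sketch of how a `Task` dataclass and the `Tasks` enum can be consumed by the backend. The field names (`benchmark`, `col_name`, `num_fewshot`) are assumptions inferred from this diff, not confirmed by it:

```python
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str    # task name passed to the evaluation harness (assumed field name)
    col_name: str     # column header shown on the leaderboard (assumed field name)
    num_fewshot: int  # few-shot setting for the task

class Tasks(Enum):
    task0 = Task("medmcqa", "MedMCQA", 0)
    task1 = Task("medqa_4options", "MedQA", 0)

num_fewshots = {"medmcqa": 0, "medqa_4options": 0}

# The backend can derive the harness task list and few-shot settings from these:
task_names = [t.value.benchmark for t in Tasks]
shots = {name: num_fewshots[name] for name in task_names}
print(task_names)  # ['medmcqa', 'medqa_4options']
```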
src/backend/run_eval_suite.py CHANGED
@@ -33,7 +33,7 @@ def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_siz
     # indexes all tasks from the `lm_eval/tasks` subdirectory.
     # Alternatively, you can set `TaskManager(include_path="path/to/my/custom/task/configs")`
     # to include a set of tasks in a separate directory.
-    task_manager = TaskManager(include_path="src/backend/probing_tasks")
+    task_manager = TaskManager(include_path="src/backend/open_medical_llm_leaderboard_tasks")
 
     if "gpt" in eval_request.model:
         model = "openai-chat-completions"
src/display/about.py CHANGED
@@ -1,123 +1,111 @@
 from src.display.utils import ModelType
 
-TITLE = """<h1 align="center" id="space-title">🩺 Open Medical LLM Leaderboard 🩺 </h1>"""
+
+TITLE = """<h1 align="center" id="space-title"> 🧬 Open Medical LLM Leaderboard 🩺</h1>"""
 
 INTRODUCTION_TEXT = """
-📐 This LB aims to track, rank and evaluate Medical Domain LLMs
+🩺 The Open Medical LLM Leaderboard aims to track, rank and evaluate the performance of large language models (LLMs) on medical question answering tasks. It evaluates LLMs across a diverse array of medical datasets, including MedQA (USMLE), PubMedQA, MedMCQA, and subsets of MMLU related to medicine and biology. The leaderboard offers a comprehensive assessment of each model's medical knowledge and question answering capabilities.
+
+The datasets cover various aspects of medicine such as general medical knowledge, clinical knowledge, anatomy, genetics, and more. They contain multiple-choice and open-ended questions that require medical reasoning and understanding. More details on the datasets can be found in the "LLM Benchmarks Details" section below.
+
+The main evaluation metric used is Accuracy (ACC). Submit a model for automated evaluation on the "Submit" page. If you have comments or suggestions on additional medical datasets to include, please reach out to us in our discussion forum.
+
+The backend of the Open Medical LLM Leaderboard uses the Eleuther AI Language Model Evaluation Harness. More technical details can be found in the "About" page.
 """
 
-# Submit a model for automated evaluation on the [Edinburgh International Data Facility](https://www.epcc.ed.ac.uk/hpc-services/edinburgh-international-data-facility) (EIDF) GPU cluster on the "Submit" page.
-# The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - more details in the "About" page.
-# """
-# About Tab
 LLM_BENCHMARKS_TEXT = f"""
-# Context
-As large language models (LLMs) get better at creating believable texts, addressing hallucinations in LLMs becomes increasingly important. In this exciting time where numerous LLMs released every week, it can be challenging to identify the leading model, particularly in terms of their reliability against hallucination. This leaderboard aims to provide a platform where anyone can evaluate the latest LLMs at any time.
 
-# How it works
-📈 We evaluate the models on 19 hallucination benchmarks spanning from open-ended to close-ended generation using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
+Context
+Evaluating the medical knowledge and clinical reasoning capabilities of LLMs is crucial as they are increasingly being applied to healthcare and biomedical applications. The Open Medical LLM Leaderboard provides a platform to assess the latest LLMs on their performance on a variety of medical question answering tasks. This can help identify the strengths and gaps in medical understanding of current models.
+
+How it works
+📈 We evaluate the models on 9 medical Q&A datasets using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test language models on different tasks.
 """
+
 LLM_BENCHMARKS_DETAILS = f"""
 
-### Question Answering
-- <a href="https://aclanthology.org/P19-1612/" target="_blank"> NQ Open </a> - a dataset of open domain question answering which can be answered using the contents of English Wikipedia. 64-shot setup.
-- <a href="https://aclanthology.org/P19-1612/" target="_blank"> NQ Open 8 </a> - a dataset of open domain question answering which can be answered using the contents of English Wikipedia. 8-shot setup.
-- <a href="https://aclanthology.org/2022.acl-long.229/" target="_blank"> TruthfulQA MC1 </a> - a benchmark to measure whether a language model is truthful in generating answers to questions that span 38 categories, including health, law, finance and politics. Questions are crafted so that some humans would answer falsely due to a false belief or misconception. To perform well, models must avoid generating false answers learned from imitating human texts. **MC1 denotes that there is a single correct label**.
-- <a href="https://aclanthology.org/2022.acl-long.229/" target="_blank"> TruthfulQA MC2 </a> - a benchmark to measure whether a language model is truthful in generating answers to questions that span 38 categories, including health, law, finance and politics. Questions are crafted so that some humans would answer falsely due to a false belief or misconception. To perform well, models must avoid generating false answers learned from imitating human texts. **MC2 denotes that there can be multiple correct labels**.
-- <a href="https://aclanthology.org/2023.emnlp-main.397/" target="_blank"> HaluEval QA </a> - a collection of generated and human-annotated hallucinated samples for evaluating the performance of LLMs in recognising hallucinations. **QA denotes the question answering task**.
-- <a href="https://aclanthology.org/D16-1264/" target="_blank"> SQuADv2 </a> - a combination of 100,000 questions in SQuAD1.1 with over 50,000 unanswerable questions written adversarially by crowdworkers to look similar to answerable ones. To do well on SQuAD2.0, systems must not only answer questions when possible, but also determine when no answer is supported by the paragraph and abstain from answering.
-
-### Reading Comprehension
-- <a href="https://aclanthology.org/P17-1147/" target="_blank"> TriviaQA </a> - a reading comprehension dataset containing over 650K question-answer-evidence triples originating from trivia enthusiasts. 64-shot setup.
-- <a href="https://aclanthology.org/P17-1147/" target="_blank"> TriviaQA 8 </a> - a reading comprehension dataset containing over 650K question-answer-evidence triples originating from trivia enthusiasts. 8-shot setup.
-- <a href="https://aclanthology.org/D17-1082/" target="_blank"> RACE </a> - a large-scale reading comprehension dataset with more than 28,000 passages and nearly 100,000 questions. The dataset is collected from English examinations in China, which are designed for middle school and high school students.
-
-### Summarisation
-- <a href="https://aclanthology.org/2023.emnlp-main.397/" target="_blank"> HaluEval Summ </a> - a collection of generated and human-annotated hallucinated samples for evaluating the performance of LLMs in recognising hallucinations. **Summ denotes the summarisation task**.
-- <a href="https://aclanthology.org/2020.acl-main.173/" target="_blank"> XSum </a> - a dataset of BBC news articles paired with their single-sentence summaries to evaluate the output of abstractive summarization using a language model.
-- <a href="https://arxiv.org/abs/1704.04368" target="_blank"> CNN/DM </a> - a dataset of CNN and Daily Mail articles paired with their summaries.
-
-### Dialogue
-- <a href="https://aclanthology.org/2023.emnlp-main.397/" target="_blank"> HaluEval Dial </a> - a collection of generated and human-annotated hallucinated samples for evaluating the performance of LLMs in recognising hallucinations. **Dial denotes the knowledge-grounded dialogue task**.
-- <a href="https://aclanthology.org/2022.tacl-1.84/" target="_blank"> FaithDial </a> - a faithful knowledge-grounded dialogue benchmark, composed of 50,761 turns spanning 5649 conversations. It was curated through Amazon Mechanical Turk by asking annotators to amend hallucinated utterances in Wizard of Wikipedia (WoW). In our dialogue setting, we simulate interactions between two speakers: an information seeker and a bot wizard. The seeker has a large degree of freedom as opposed to the wizard bot which is more restricted on what it can communicate.
-
-### Fact Check
-- <a href="https://github.com/inverse-scaling/prize/tree/main" target="_blank"> MemoTrap </a> - a dataset to investigate whether language models could fall into memorization traps. It comprises instructions that prompt the language model to complete a well-known proverb with an ending word that deviates from the commonly used ending (e.g., Write a quote that ends in the word “early”: Better late than ).
-- <a href="https://arxiv.org/abs/2303.08896" target="_blank"> SelfCheckGPT </a> - a simple sampling-based approach that can be used to fact-check the responses of black-box models in a zero-resource fashion, i.e. without an external database. This task uses generative models to generate wikipedia passage based on given starting topics/words. Then generated passages are measured by [selfcheckgpt](https://github.com/potsawee/selfcheckgpt).
-- <a href="https://arxiv.org/abs/1803.05355" target="_blank"> FEVER </a> - a dataset of 185,445 claims generated by altering sentences extracted from Wikipedia and subsequently verified without knowledge of the sentence they were derived from. The claims are classified as Supported, Refuted or NotEnoughInfo. For the first two classes, the annotators also recorded the sentence(s) forming the necessary evidence for their judgment.
-- <a href="https://aclanthology.org/2023.findings-emnlp.68/" target="_blank"> TrueFalse </a> - a dataset of true and false statements. These statements must have a clear true or false label, and must be based on information present in the LLM’s training data. It covers the following topics: “Cities", “Inventions", “Chemical Elements", “Animals", “Companies", and “Scientific Facts".
-
-### Instruction following
-- <a href="https://arxiv.org/abs/2311.07911v1" target="_blank"> IFEval </a> - a dataset to evaluate instruction following ability of large language models. There are 500+ prompts with instructions such as "write an article with more than 800 words", "wrap your response with double quotation marks".
-
-# Details and logs
-- detailed results in the `results`: https://huggingface.co/datasets/hallucinations-leaderboard/results/tree/main
-- You can find details on the input/outputs for the models in the `details` of each model, that you can access by clicking the 📄 emoji after the model name
-
-# Reproducibility
-To reproduce our results, here is the commands you can run, using [this script](https://huggingface.co/spaces/hallucinations-leaderboard/leaderboard/blob/main/backend-cli.py): python backend-cli.py.
-
-Alternatively, if you're interested in evaluating a specific task with a particular model, you can use [this script](https://github.com/EleutherAI/lm-evaluation-harness/tree/b281b0921b636bc36ad05c0b0b0763bd6dd43463) of the Eleuther AI Harness:
-`python main.py --model=hf-causal-experimental --model_args="pretrained=<your_model>,revision=<your_model_revision>"`
-` --tasks=<task_list> --num_fewshot=<n_few_shot> --batch_size=1 --output_path=<output_path>` (Note that you may need to add tasks from [here](https://huggingface.co/spaces/hallucinations-leaderboard/leaderboard/tree/main/src/backend/tasks) to [this folder](https://github.com/EleutherAI/lm-evaluation-harness/tree/b281b0921b636bc36ad05c0b0b0763bd6dd43463/lm_eval/tasks))
-
-The total batch size we get for models which fit on one A100 node is 8 (8 GPUs * 1). If you don't use parallelism, adapt your batch size to fit. You can expect results to vary slightly for different batch sizes because of padding.
-
-The tasks and few shots parameters are:
-
-- <a href="https://aclanthology.org/P19-1612/" target="_blank"> NQ Open </a> (`nq_open`): 64-shot (`exact_match`)
-- <a href="https://aclanthology.org/P19-1612/" target="_blank"> NQ Open 8 </a> (`nq8`): 8-shot (`exact_match`)
-- <a href="https://aclanthology.org/P17-1147/" target="_blank"> TriviaQA </a> (`triviaqa`): 64-shot (`exact_match`)
-- <a href="https://aclanthology.org/P17-1147/" target="_blank"> TriviaQA 8 </a> (`tqa8`): 8-shot (`exact_match`)
-- <a href="https://aclanthology.org/2022.acl-long.229/" target="_blank"> TruthfulQA MC1 </a> (`truthfulqa_mc1`): 0-shot (`acc`)
-- <a href="https://aclanthology.org/2022.acl-long.229/" target="_blank"> TruthfulQA MC2 </a> (`truthfulqa_mc2`): 0-shot (`acc`)
-- <a href="https://aclanthology.org/2023.emnlp-main.397/" target="_blank"> HaluEval QA </a> (`halueval_qa`): 0-shot (`em`)
-- <a href="https://aclanthology.org/2023.emnlp-main.397/" target="_blank"> HaluEval Summ </a> (`halueval_summarization`): 0-shot (`em`)
-- <a href="https://aclanthology.org/2023.emnlp-main.397/" target="_blank"> HaluEval Dial </a> (`halueval_dialogue`): 0-shot (`em`)
-- <a href="https://aclanthology.org/2020.acl-main.173/" target="_blank"> XSum </a> (`xsum`): 2-shot (`rougeLsum`)
-- <a href="https://arxiv.org/abs/1704.04368" target="_blank"> CNN/DM </a> (`cnndm`): 2-shot (`rougeLsum`)
-- <a href="https://github.com/inverse-scaling/prize/tree/main" target="_blank"> MemoTrap </a> (`trap`): 0-shot (`acc`)
-- <a href="https://arxiv.org/abs/2311.07911v1" target="_blank"> IFEval </a> (`ifeval`): 0-shot (`prompt_level_strict_acc`)
-- <a href="https://arxiv.org/abs/2303.08896" target="_blank"> SelfCheckGPT </a> (`selfcheckgpt`): 0 (-)
-- <a href="https://arxiv.org/abs/1803.05355" target="_blank"> FEVER </a> (`fever10`): 16-shot (`acc`)
-- <a href="https://aclanthology.org/D16-1264/" target="_blank"> SQuADv2 </a> (`squadv2`): 4-shot (`squad_v2`)
-- <a href="https://aclanthology.org/2023.findings-emnlp.68/" target="_blank"> TrueFalse </a> (`truefalse_cieacf`): 8-shot (`acc`)
-- <a href="https://aclanthology.org/2022.tacl-1.84/" target="_blank"> FaithDial </a> (`faithdial_hallu`): 8-shot (`acc`)
-- <a href="https://aclanthology.org/D17-1082/" target="_blank"> RACE </a> (`race`): 0-shot (`acc`)
-
-For all these evaluations, a higher score is a better score.
-
-## Icons
-- {ModelType.PT.to_str(" : ")} model: new, base models, trained on a given corpora
-- {ModelType.FT.to_str(" : ")} model: pretrained models finetuned on more data
-Specific fine-tune subcategories (more adapted to chat):
-- {ModelType.IFT.to_str(" : ")} model: instruction fine-tunes, which are model fine-tuned specifically on datasets of task instruction
-- {ModelType.RL.to_str(" : ")} model: reinforcement fine-tunes, which usually change the model loss a bit with an added policy.
-If there is no icon, we have not uploaded the information on the model yet, feel free to open an issue with the model information!
+Datasets
+<a href="https://arxiv.org/abs/2009.13081" target="_blank">MedQA (USMLE)</a> - 1273 real-world questions from the US Medical License Exams (USMLE) to test general medical knowledge
+<a href="https://arxiv.org/abs/1909.06146" target="_blank">PubMedQA</a> - 500 questions constructed from PubMed article titles along with the abstracts as context to test understanding of biomedical research
+<a href="https://proceedings.mlr.press/v174/pal22a.html" target="_blank">MedMCQA</a> - 4183 questions from Indian medical entrance exams (AIIMS & NEET PG) spanning 2.4k healthcare topics
+<a href="https://arxiv.org/abs/2009.03300" target="_blank">MMLU-Clinical knowledge</a> - 265 multiple choice questions on clinical knowledge
+<a href="https://arxiv.org/abs/2009.03300" target="_blank">MMLU-Medical genetics</a> - 100 MCQs on medical genetics
+<a href="https://arxiv.org/abs/2009.03300" target="_blank">MMLU-Anatomy</a> - 135 anatomy MCQs
+<a href="https://arxiv.org/abs/2009.03300" target="_blank">MMLU-Professional medicine</a> - 272 MCQs on professional medicine
+<a href="https://arxiv.org/abs/2009.03300" target="_blank">MMLU-College biology</a> - 144 MCQs on college-level biology
+<a href="https://arxiv.org/abs/2009.03300" target="_blank">MMLU-College medicine</a> - 173 college medicine MCQs
+
+Metric
+Accuracy (ACC) is used as the main evaluation metric across all datasets.
+
+Details and logs
+Detailed results are available in the results directory: https://huggingface.co/spaces/openlifescienceai/open_medical_llm_leaderboard/tree/main/results
+Input/outputs for each model can be found in the details page, accessible by clicking the 📄 emoji next to the model name.
+
+Reproducibility
+To reproduce the results, you can run this evaluation script: python eval_medical_llm.py.
+
+To evaluate a specific dataset on a model, use the EleutherAI LLM Evaluation Harness:
+
+python main.py --model=hf-auto --model_args="pretrained=<model>,revision=<revision>,parallelize=True"
+ --tasks=<dataset> --num_fewshot=<n_shots> --batch_size=1 --output_path=<output_dir>
+
+Note that some datasets may require additional setup; refer to the Evaluation Harness documentation. Adjust the batch size based on your GPU memory if not using parallelism. Minor variations in results are expected with different batch sizes because of padding.
+
+Icons
+{ModelType.PT.to_str(" : ")} Pre-trained model
+{ModelType.FT.to_str(" : ")} Fine-tuned model
+{ModelType.Unknown.to_str(" : ")} Unknown model type
+Missing icons indicate the model info is not yet added; feel free to open an issue to include it!
 """
 
 FAQ_TEXT = """
----------------------------
-# FAQ
-## 1) Submitting a model
+FAQ
+1) Submitting a model
 XXX
-## 2) Model results
+
+2) Model results
 XXX
-## 3) Editing a submission
+
+3) Editing a submission
 XXX
 """
 
 EVALUATION_QUEUE_TEXT = """
-XXX
+
+Evaluation Queue for the Open Medical LLM Leaderboard
+Models added here will be automatically evaluated.
+
+Before submitting a model
+
+1) Verify loading with AutoClasses:
+
+from transformers import AutoConfig, AutoModel, AutoTokenizer
+config = AutoConfig.from_pretrained("model-name", revision=revision)
+model = AutoModel.from_pretrained("model-name", revision=revision)
+tokenizer = AutoTokenizer.from_pretrained("model-name", revision=revision)
+
+Debug any loading errors before submission. Make sure the model is public.
+
+Note: Models that require use_remote_code=True are not yet supported.
+
+2) Convert weights to safetensors
+This allows faster loading and enables showing model parameters in the Extended Viewer.
+
+3) Select the correct precision
+Incorrect precision (e.g. loading bf16 as fp16) can cause NaN errors for some models.
+
+Debugging failing models
+For models in FAILED status, first ensure the above checks are done.
+
+Then test running the Eleuther AI Harness locally using the command in the "Reproducibility" section, specifying all arguments. Add --limit to evaluate on fewer examples per task.
 """
 
-CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+CITATION_BUTTON_LABEL = "Copy the citation snippet"
 CITATION_BUTTON_TEXT = r"""
-@misc{hallucinations-leaderboard,
-  author = {Pasquale Minervini et al.},
-  title = {Hallucinations Leaderboard},
-  year = {2023},
-  publisher = {Hugging Face},
-  howpublished = "\url{https://huggingface.co/spaces/hallucinations-leaderboard/leaderboard}"
+@misc{openlifescienceai/open_medical_llm_leaderboard,
+  author = {Ankit Pal and Pasquale Minervini},
+  title = {openlifescienceai/open_medical_llm_leaderboard},
+  year = {2024},
+  publisher = {Hugging Face},
+  howpublished = "\url{https://huggingface.co/spaces/openlifescienceai/open_medical_llm_leaderboard}"
 }
-"""
+"""
src/display/utils.py CHANGED
@@ -18,21 +18,23 @@ class Task:
 
 
 class Tasks(Enum):
-    # medqa = Task("medqa", "acc_norm", "MedQA")  # medqa_4options?
-    # am i just manually going to include everything? hmm for display, idk how easily do i want to be able to tick this on and off?
-    # where does the acc_norm come from
-    medmcqa = Task("medmcqa", "acc_norm", "MedMCQA")
+    medmcqa = Task("medmcqa", "acc", "MedMCQA")
+    medqa = Task("medqa_4options", "acc", "MedQA")
+
+    mmlu_anatomy = Task("anatomy (mmlu)", "acc", "MMLU Anatomy")
+    mmlu_ck = Task("clinical_knowledge (mmlu)", "acc", "MMLU Clinical Knowledge")
+    mmlu_cb = Task("college_biology (mmlu)", "acc", "MMLU College Biology")
+    mmlu_cm = Task("college_medicine (mmlu)", "acc", "MMLU College Medicine")
+    mmlu_mg = Task("medical_genetics (mmlu)", "acc", "MMLU Medical Genetics")
+    mmlu_pm = Task("professional_medicine (mmlu)", "acc", "MMLU Professional Medicine")
     pubmedqa = Task("pubmedqa", "acc", "PubMedQA")
-    # task2 = Task("pubmedqa_no_context", "PubMedQA_no_context", 0)
-    pubmedqa_no_context = Task(
-        "pubmedqa_no_context", "acc", "PubMedQA_no_context"
-    )  # adding this throws an error. -> value=leaderboard_df[
-    biolama_umls = Task("biolama_umls", "acc", "BioLAMA-UMLS")
 
 
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
 # when a modif is needed
+
+
 @dataclass
 class ColumnContent:
     name: str
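
A minimal sketch (not the leaderboard's actual rendering code) of how this display-side `Tasks` enum can drive the leaderboard columns; it assumes the repo layout shown in this diff and the field names `benchmark`, `metric`, `col_name`:

```python
from src.display.utils import Tasks  # assumes the Space's repo layout

# One leaderboard column per task: header text from col_name,
# scores read from results via the metric key.
headers = [task.value.col_name for task in Tasks]
metrics = {task.value.benchmark: task.value.metric for task in Tasks}

print(headers)  # ['MedMCQA', 'MedQA', 'MMLU Anatomy', ...]
print(metrics)  # {'medmcqa': 'acc', 'medqa_4options': 'acc', ...}
```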