Spaces:
Runtime error
Runtime error
magilogi
committed on
Commit
β’
4c59875
1
Parent(s):
cfb403c
rabbits-leaderboard-v0.1
Browse files- app.py +163 -0
- custom.css +62 -0
- data/csv/models_data.csv +20 -0
- data/raw-eval-outputs/01-ai-Yi-1.5-34B_results.json +252 -0
- data/raw-eval-outputs/CohereForAI-aya-23-35B_results.json +250 -0
- data/raw-eval-outputs/CohereForAI-c4ai-command-r-plus_results.json +250 -0
- data/raw-eval-outputs/ProbeMedicalYonseiMAILab-medllama3-v20_results.json +252 -0
- data/raw-eval-outputs/Qwen-Qwen2-72B_results.json +250 -0
- data/raw-eval-outputs/Qwen-Qwen2-7B_results.json +316 -0
- data/raw-eval-outputs/aaditya-Llama3-OpenBioLLM-70B_results.json +252 -0
- data/raw-eval-outputs/johnsnowlabs-JSL-MedLlama-3-8B-v9_results.json +252 -0
- data/raw-eval-outputs/meta-llama-Llama-2-70B-hf_results.json +250 -0
- data/raw-eval-outputs/meta-llama-Llama-2-7b-hf_results.json +250 -0
- data/raw-eval-outputs/meta-llama-Meta-Llama-3-70B_results.json +250 -0
- data/raw-eval-outputs/meta-llama-Meta-Llama-3-8B_results.json +250 -0
- data/raw-eval-outputs/microsoft-Phi-3-medium-4k-instruct_results.json +316 -0
- data/raw-eval-outputs/microsoft-phi-1_5_results.json +316 -0
- data/raw-eval-outputs/microsoft-phi-1_results.json +316 -0
- data/raw-eval-outputs/microsoft-phi-2_results.json +316 -0
- data/raw-eval-outputs/mistralai-Mistral-7B-v0.3_results.json +316 -0
- data/raw-eval-outputs/mistralai-Mixtral-8x22B-v0.1_results.json +252 -0
- data/raw-eval-outputs/mistralai-Mixtral-8x7B-v0.1_results.json +250 -0
- src/__pycache__/model_links.cpython-311.pyc +0 -0
- src/__pycache__/models_info.cpython-311.pyc +0 -0
- src/json2df.py +67 -0
- src/models_info.py +79 -0
app.py
ADDED
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import gradio as gr
|
3 |
+
import plotly.express as px
|
4 |
+
import plotly.graph_objects as go
|
5 |
+
|
6 |
+
df = pd.read_csv("data/csv/models_data.csv")
|
7 |
+
|
8 |
+
|
9 |
+
filter_mapping = {
|
10 |
+
"all": "all",
|
11 |
+
"π’ Pre-trained": "π’",
|
12 |
+
"π© Continuously pre-trained": "π©",
|
13 |
+
"πΆ Fine-tuned on domain-specific data": "πΆ",
|
14 |
+
"π¬ Chat-models (RLHF, DPO, IFT, ...)": "π¬"
|
15 |
+
}
|
16 |
+
|
17 |
+
|
18 |
+
def filter_items(df, query):
    """Return the rows of *df* whose ``"T"`` column matches a model-type filter.

    Args:
        df: DataFrame with a string column ``"T"`` holding the model-type
            marker of each row.
        query: Either ``"all"`` (no filtering) or a key of the module-level
            ``filter_mapping`` dictionary.

    Returns:
        ``df`` itself when ``query`` is ``"all"``; otherwise the subset of
        rows whose ``"T"`` value contains the marker mapped from ``query``.

    Raises:
        KeyError: If ``query`` is not ``"all"`` and not in ``filter_mapping``.
    """
    if query == "all":
        return df
    marker = filter_mapping[query]
    # regex=False: the marker is a literal symbol, not a pattern, so it must
    # never be interpreted as a regular expression.
    # na=False: rows with a missing type marker simply do not match.
    return df[df["T"].str.contains(marker, na=False, regex=False)]
|
23 |
+
|
24 |
+
|
25 |
+
def create_scatter_plot(df, x_col, y_col, title, x_title, y_title):
    """Build a scatter figure of two columns with a dashed y=x reference line.

    Args:
        df: DataFrame passed to ``px.scatter``; points are coloured by its
            ``'Model'`` column.
        x_col: Name of the column plotted on the x axis.
        y_col: Name of the column plotted on the y axis.
        title: Figure title.
        x_title: Label shown on the x axis.
        y_title: Label shown on the y axis.

    Returns:
        The figure, with both axes fixed to the range 0-100.
    """
    figure = px.scatter(df, x=x_col, y=y_col, color='Model', title=title)

    # Diagonal reference: points below it have a lower y value than x value.
    diagonal = go.Scatter(
        x=[0, 100],
        y=[0, 100],
        mode="lines",
        name="y=x line",
        line={"color": 'black', "dash": 'dash'},
    )
    figure.add_trace(diagonal)

    figure.update_layout(
        xaxis_title=x_title,
        yaxis_title=y_title,
        xaxis={"range": [0, 100]},
        yaxis={"range": [0, 100]},
        legend_title_text='Model',
    )
    # Enlarge only the marker traces; the selector leaves the line trace alone.
    figure.update_traces(marker={"size": 10}, selector={"mode": 'markers'})
    return figure
|
46 |
+
|
47 |
+
|
48 |
+
with gr.Blocks(css="custom.css") as demo:
|
49 |
+
with gr.Row():
|
50 |
+
gr.Markdown(
|
51 |
+
"""<div style="text-align: center;"><h1> <span style='color: #6aa84f;'>π° RABBITS:</span> <span style='color: #6aa84f;'>R</span>obust <span style='color: #6aa84f;'>A</span>ssessment of <span style='color: #6aa84f;'>B</span>iomedical <span style='color: #6aa84f;'>B</span>enchmarks <span style='color: #6aa84f;'>I</span>nvolving drug
|
52 |
+
<span style='color: #6aa84f;'>T</span>erm <span style='color: #6aa84f;'>S</span>ubstitutions for Language Models <span style='color: #6aa84f;'></span></h1></div>\
|
53 |
+
<br>\
|
54 |
+
<p class='markdown-text'>Robust language models are crucial in the medical domain and the RABBITS project tests the robustness of LLMs by evaluating their handling of synonyms, specifically brand and generic drug names. We assessed 16 open-source language models from Hugging Face using systematic synonym substitution on MedQA and MedMCQA tasks. Our results show a consistent decline in performance across all model sizes, highlighting challenges in synonym comprehension. Additionally, we discovered significant dataset contamination by identifying overlaps between MedQA, MedMCQA test sets, and the Dolma 1.6 dataset using an 8-gram analysis. This highlights the need to improve model robustness and address contamination in open-source datasets</p>"""
|
55 |
+
)
|
56 |
+
|
57 |
+
with gr.Tabs(elem_classes="tab-buttons"):
|
58 |
+
with gr.TabItem("π Evaluation table"):
|
59 |
+
with gr.Column():
|
60 |
+
with gr.Accordion("β‘οΈ Filter by Column", open=False):
|
61 |
+
shown_columns = gr.CheckboxGroup(
|
62 |
+
choices=df.columns.tolist(),
|
63 |
+
value=df.columns.tolist(),
|
64 |
+
label="Select Columns",
|
65 |
+
interactive=True,
|
66 |
+
)
|
67 |
+
with gr.Row():
|
68 |
+
search_bar = gr.Textbox(
|
69 |
+
placeholder="π Search for your model and press ENTER...",
|
70 |
+
show_label=False,
|
71 |
+
elem_id="search-bar"
|
72 |
+
)
|
73 |
+
filter_columns = gr.Radio(
|
74 |
+
label="β Filter model types",
|
75 |
+
choices=[
|
76 |
+
"all",
|
77 |
+
"π’ Pre-trained",
|
78 |
+
"π© Continuously pre-trained",
|
79 |
+
"πΆ Fine-tuned on domain-specific data",
|
80 |
+
"π¬ Chat-models (RLHF, DPO, IFT, ...)"
|
81 |
+
],
|
82 |
+
value="all",
|
83 |
+
elem_id="filter-columns",
|
84 |
+
)
|
85 |
+
leaderboard_df = gr.Dataframe(
|
86 |
+
value=df,
|
87 |
+
headers="keys",
|
88 |
+
datatype=["html" if col == "Model" else "str" for col in df.columns],
|
89 |
+
interactive=False,
|
90 |
+
elem_id="leaderboard-table"
|
91 |
+
)
|
92 |
+
|
93 |
+
def update_leaderboard(search_query):
    """Return the leaderboard rows whose ``"Model"`` cell contains the query.

    The match is a case-insensitive, literal substring test against the
    module-level ``df``.

    Args:
        search_query: Text submitted from the search box.

    Returns:
        The full ``df`` when the query is empty; otherwise the matching rows.
    """
    if not search_query:
        return df
    # regex=False: user input is matched literally, so characters such as
    # "(" or "[" cannot raise a regex error and "." does not match anything.
    # na=False: rows with a missing model name are treated as non-matching
    # instead of putting NaN into the boolean mask.
    matches = df["Model"].str.contains(
        search_query, case=False, regex=False, na=False
    )
    return df[matches]
|
96 |
+
|
97 |
+
search_bar.submit(
|
98 |
+
update_leaderboard,
|
99 |
+
inputs=search_bar,
|
100 |
+
outputs=leaderboard_df
|
101 |
+
)
|
102 |
+
|
103 |
+
def filter_update(query):
    """Return the rows of the module-level ``df`` for the selected filter."""
    return filter_items(df, query)
|
106 |
+
|
107 |
+
filter_columns.change(
|
108 |
+
filter_update,
|
109 |
+
inputs=filter_columns,
|
110 |
+
outputs=leaderboard_df
|
111 |
+
)
|
112 |
+
|
113 |
+
shown_columns.change(
|
114 |
+
lambda cols: df[cols],
|
115 |
+
inputs=shown_columns,
|
116 |
+
outputs=leaderboard_df
|
117 |
+
)
|
118 |
+
|
119 |
+
with gr.TabItem("π Evaluation Plots"):
|
120 |
+
with gr.Column():
|
121 |
+
with gr.Row():
|
122 |
+
scatter1 = gr.Plot(
|
123 |
+
value=create_scatter_plot(df, "medmcqa_orig_filtered", "medmcqa_g2b",
|
124 |
+
"MedMCQA: Orig vs G2B", "medmcqa_orig_filtered", "medmcqa_g2b"),
|
125 |
+
elem_id="scatter1"
|
126 |
+
)
|
127 |
+
scatter2 = gr.Plot(
|
128 |
+
value=create_scatter_plot(df, "medqa_4options_orig_filtered", "medqa_4options_g2b",
|
129 |
+
"MedQA: Orig vs G2B", "medqa_4options_orig_filtered", "medqa_4options_g2b"),
|
130 |
+
elem_id="scatter2"
|
131 |
+
)
|
132 |
+
with gr.Row():
|
133 |
+
scatter3 = gr.Plot(
|
134 |
+
value=create_scatter_plot(df, "b4bqa", "b4b",
|
135 |
+
"b4bqa vs b4b", "b4bqa", "b4b"),
|
136 |
+
elem_id="scatter3"
|
137 |
+
)
|
138 |
+
|
139 |
+
with gr.TabItem("π About"):
|
140 |
+
gr.Markdown(
|
141 |
+
"""<div style="text-align: center;">
|
142 |
+
<h2>About RABBITS LLM Leaderboard</h2>
|
143 |
+
<p>This leaderboard ...</p>
|
144 |
+
<p>It is designed to ...</p>
|
145 |
+
</div>""",
|
146 |
+
elem_classes="markdown-text"
|
147 |
+
)
|
148 |
+
|
149 |
+
with gr.TabItem("π Submit Here!"):
|
150 |
+
gr.Markdown(
|
151 |
+
"""<div style="text-align: center;">
|
152 |
+
<h2>Submit Your Model Results</h2>
|
153 |
+
<p>If you have new model results that you would like to add to the leaderboard, please follow the submission guidelines below:</p>
|
154 |
+
<ul>
|
155 |
+
<li>COMING SOON</li>
|
156 |
+
</ul>
|
157 |
+
<p>COMING SOON</p>
|
158 |
+
</div>""",
|
159 |
+
elem_classes="markdown-text"
|
160 |
+
)
|
161 |
+
|
162 |
+
if __name__ == "__main__":
|
163 |
+
demo.launch()
|
custom.css
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#changelog-text {
|
2 |
+
font-size: 16px !important;
|
3 |
+
}
|
4 |
+
#changelog-text h2 {
|
5 |
+
font-size: 18px !important;
|
6 |
+
}
|
7 |
+
.markdown-text {
|
8 |
+
font-size: 16px !important;
|
9 |
+
}
|
10 |
+
#models-to-add-text {
|
11 |
+
font-size: 18px !important;
|
12 |
+
}
|
13 |
+
#citation-button span {
|
14 |
+
font-size: 16px !important;
|
15 |
+
}
|
16 |
+
#citation-button textarea {
|
17 |
+
font-size: 16px !important;
|
18 |
+
}
|
19 |
+
#citation-button > label > button {
|
20 |
+
margin: 6px;
|
21 |
+
transform: scale(1.3);
|
22 |
+
}
|
23 |
+
#leaderboard-table {
|
24 |
+
margin-top: 15px
|
25 |
+
}
|
26 |
+
#leaderboard-table-lite {
|
27 |
+
margin-top: 15px
|
28 |
+
}
|
29 |
+
#search-bar-table-box > div:first-child {
|
30 |
+
background: none;
|
31 |
+
border: none;
|
32 |
+
}
|
33 |
+
|
34 |
+
#search-bar {
|
35 |
+
padding: 0px;
|
36 |
+
}
|
37 |
+
/* Hides the final AutoEvalColumn */
|
38 |
+
#llm-benchmark-tab-table table td:last-child,
|
39 |
+
#llm-benchmark-tab-table table th:last-child {
|
40 |
+
display: none;
|
41 |
+
}
|
42 |
+
/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
|
43 |
+
table td:first-child,
|
44 |
+
table th:first-child {
|
45 |
+
max-width: 400px;
|
46 |
+
overflow: auto;
|
47 |
+
white-space: nowrap;
|
48 |
+
}
|
49 |
+
.tab-buttons button {
|
50 |
+
font-size: 20px;
|
51 |
+
}
|
52 |
+
#scale-logo {
|
53 |
+
border-style: none !important;
|
54 |
+
box-shadow: none;
|
55 |
+
display: block;
|
56 |
+
margin-left: auto;
|
57 |
+
margin-right: auto;
|
58 |
+
max-width: 600px;
|
59 |
+
}
|
60 |
+
#scale-logo .download {
|
61 |
+
display: none;
|
62 |
+
}
|
data/csv/models_data.csv
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
T,Model,b4bqa,b4b,medmcqa_g2b,medmcqa_orig_filtered,medmcqa_diff,medqa_4options_g2b,medqa_4options_orig_filtered,medqa_diff
|
2 |
+
πΆ,"<a target=""_blank"" href=""https://huggingface.co/01-ai/Yi-1.5-34B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">01-ai-Yi-1.5-34B</a>",85.16,75.37,59.77,69.25,-9.48,59.79,64.55,-4.76
|
3 |
+
πΆ,"<a target=""_blank"" href=""https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">aaditya-Llama3-OpenBioLLM-70B</a>",85.1,78.76,63.22,73.85,-10.63,70.9,75.4,-4.5
|
4 |
+
πΆ,"<a target=""_blank"" href=""https://huggingface.co/CohereForAI/aya-23-35B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">CohereForAI-aya-23-35B</a>",78.4,65.72,48.56,52.87,-4.31,47.88,51.06,-3.18
|
5 |
+
π¬,"<a target=""_blank"" href=""https://huggingface.co/CohereForAI/c4ai-command-r-plus"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">CohereForAI-c4ai-command-r-plus</a>",84.93,72.41,49.14,61.49,-12.35,56.61,60.32,-3.71
|
6 |
+
πΆ,"<a target=""_blank"" href=""https://huggingface.co/johnsnowlabs/JSL-MedLlama-3-8B-v9"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">johnsnowlabs-JSL-MedLlama-3-8B-v9</a>",75.17,74.45,64.08,77.01,-12.93,70.63,82.01,-11.38
|
7 |
+
π’,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Llama-2-70B-hf"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama-Llama-2-70B-hf</a>",77.01,65.63,45.98,52.3,-6.32,52.65,55.03,-2.38
|
8 |
+
π’,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Llama-2-7b-hf"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama-Llama-2-7b-hf</a>",36.83,36.0,33.91,34.2,-0.29,34.39,37.3,-2.91
|
9 |
+
π’,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Meta-Llama-3-70B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama-Meta-Llama-3-70B</a>",90.12,82.55,66.67,78.16,-11.49,72.75,75.13,-2.38
|
10 |
+
π’,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Meta-Llama-3-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama-Meta-Llama-3-8B</a>",82.7,71.21,52.87,59.2,-6.33,55.03,60.85,-5.82
|
11 |
+
π’,"<a target=""_blank"" href=""https://huggingface.co/microsoft/phi-1_5"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">microsoft-phi-1_5</a>",28.01,30.24,31.61,30.46,1.15,34.92,34.66,0.26
|
12 |
+
π’,"<a target=""_blank"" href=""https://huggingface.co/microsoft/phi-1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">microsoft-phi-1</a>",19.64,21.18,24.14,25.86,-1.72,21.69,20.9,0.79
|
13 |
+
π’,"<a target=""_blank"" href=""https://huggingface.co/microsoft/phi-2"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">microsoft-phi-2</a>",47.49,44.79,37.64,42.24,-4.6,41.8,43.92,-2.12
|
14 |
+
π¬,"<a target=""_blank"" href=""https://huggingface.co/microsoft/Phi-3-medium-4k-instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">microsoft-Phi-3-medium-4k-instruct</a>",69.98,65.94,60.34,72.41,-12.07,53.44,58.47,-5.03
|
15 |
+
π©,"<a target=""_blank"" href=""https://huggingface.co/mistralai/Mistral-7B-v0.3"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">mistralai-Mistral-7B-v0.3</a>",70.31,61.99,48.28,56.9,-8.62,48.68,53.17,-4.49
|
16 |
+
π©,"<a target=""_blank"" href=""https://huggingface.co/mistralai/Mixtral-8x22B-v0.1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">mistralai-Mixtral-8x22B-v0.1</a>",87.72,78.82,61.78,70.4,-8.62,67.46,71.43,-3.97
|
17 |
+
π©,"<a target=""_blank"" href=""https://huggingface.co/mistralai/Mixtral-8x7B-v0.1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">mistralai-Mixtral-8x7B-v0.1</a>",86.1,74.75,55.46,64.94,-9.48,60.05,62.43,-2.38
|
18 |
+
πΆ,"<a target=""_blank"" href=""https://huggingface.co/ProbeMedicalYonseiMAILab/medllama3-v20"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ProbeMedicalYonseiMAILab-medllama3-v20</a>",71.93,74.75,65.23,80.17,-14.94,76.46,90.21,-13.75
|
19 |
+
π’,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen2-72B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen-Qwen2-72B</a>",91.02,83.72,71.55,77.87,-6.32,74.07,75.4,-1.33
|
20 |
+
π’,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen2-7B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen-Qwen2-7B</a>",80.41,70.28,55.17,63.51,-8.34,53.7,58.99,-5.29
|
data/raw-eval-outputs/01-ai-Yi-1.5-34B_results.json
ADDED
@@ -0,0 +1,252 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"b4b": {
|
4 |
+
"acc,none": 0.7536991368680641,
|
5 |
+
"acc_stderr,none": 0.09728135187806679,
|
6 |
+
"acc_norm,none": 0.7536991368680641,
|
7 |
+
"acc_norm_stderr,none": 0.09728135187806679,
|
8 |
+
"alias": "b4b"
|
9 |
+
},
|
10 |
+
"b4bqa": {
|
11 |
+
"acc,none": 0.8515625,
|
12 |
+
"acc_stderr,none": 0.008401025189152976,
|
13 |
+
"acc_norm,none": 0.8515625,
|
14 |
+
"acc_norm_stderr,none": 0.008401025189152976,
|
15 |
+
"alias": " - b4bqa"
|
16 |
+
},
|
17 |
+
"medmcqa_g2b": {
|
18 |
+
"acc,none": 0.5977011494252874,
|
19 |
+
"acc_stderr,none": 0.026323989201783506,
|
20 |
+
"acc_norm,none": 0.5977011494252874,
|
21 |
+
"acc_norm_stderr,none": 0.026323989201783506,
|
22 |
+
"alias": " - medmcqa_g2b"
|
23 |
+
},
|
24 |
+
"medmcqa_orig_filtered": {
|
25 |
+
"acc,none": 0.6925287356321839,
|
26 |
+
"acc_stderr,none": 0.024771735192072118,
|
27 |
+
"acc_norm,none": 0.6925287356321839,
|
28 |
+
"acc_norm_stderr,none": 0.024771735192072118,
|
29 |
+
"alias": " - medmcqa_orig_filtered"
|
30 |
+
},
|
31 |
+
"medqa_4options_g2b": {
|
32 |
+
"acc,none": 0.5978835978835979,
|
33 |
+
"acc_stderr,none": 0.025253032554997695,
|
34 |
+
"acc_norm,none": 0.5978835978835979,
|
35 |
+
"acc_norm_stderr,none": 0.025253032554997695,
|
36 |
+
"alias": " - medqa_4options_g2b"
|
37 |
+
},
|
38 |
+
"medqa_4options_orig_filtered": {
|
39 |
+
"acc,none": 0.6455026455026455,
|
40 |
+
"acc_stderr,none": 0.024636830602842,
|
41 |
+
"acc_norm,none": 0.6455026455026455,
|
42 |
+
"acc_norm_stderr,none": 0.024636830602842,
|
43 |
+
"alias": " - medqa_4options_orig_filtered"
|
44 |
+
}
|
45 |
+
},
|
46 |
+
"groups": {
|
47 |
+
"b4b": {
|
48 |
+
"acc,none": 0.7536991368680641,
|
49 |
+
"acc_stderr,none": 0.09728135187806679,
|
50 |
+
"acc_norm,none": 0.7536991368680641,
|
51 |
+
"acc_norm_stderr,none": 0.09728135187806679,
|
52 |
+
"alias": "b4b"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"configs": {
|
56 |
+
"b4bqa": {
|
57 |
+
"task": "b4bqa",
|
58 |
+
"dataset_path": "AIM-Harvard/b4b_drug_qa",
|
59 |
+
"test_split": "test",
|
60 |
+
"doc_to_text": "<function process_cd at 0x7f7ab0e88700>",
|
61 |
+
"doc_to_target": "correct_choice",
|
62 |
+
"doc_to_choice": [
|
63 |
+
"A",
|
64 |
+
"B",
|
65 |
+
"C",
|
66 |
+
"D"
|
67 |
+
],
|
68 |
+
"description": "",
|
69 |
+
"target_delimiter": " ",
|
70 |
+
"fewshot_delimiter": "\n\n",
|
71 |
+
"metric_list": [
|
72 |
+
{
|
73 |
+
"metric": "acc",
|
74 |
+
"aggregation": "mean",
|
75 |
+
"higher_is_better": true
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"metric": "acc_norm",
|
79 |
+
"aggregation": "mean",
|
80 |
+
"higher_is_better": true
|
81 |
+
}
|
82 |
+
],
|
83 |
+
"output_type": "multiple_choice",
|
84 |
+
"repeats": 1,
|
85 |
+
"should_decontaminate": false
|
86 |
+
},
|
87 |
+
"medmcqa_g2b": {
|
88 |
+
"task": "medmcqa_g2b",
|
89 |
+
"dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
|
90 |
+
"training_split": "train",
|
91 |
+
"validation_split": "validation",
|
92 |
+
"test_split": "validation",
|
93 |
+
"doc_to_text": "<function doc_to_text at 0x7f7ab0ed1f30>",
|
94 |
+
"doc_to_target": "cop",
|
95 |
+
"doc_to_choice": [
|
96 |
+
"A",
|
97 |
+
"B",
|
98 |
+
"C",
|
99 |
+
"D"
|
100 |
+
],
|
101 |
+
"description": "",
|
102 |
+
"target_delimiter": " ",
|
103 |
+
"fewshot_delimiter": "\n\n",
|
104 |
+
"metric_list": [
|
105 |
+
{
|
106 |
+
"metric": "acc",
|
107 |
+
"aggregation": "mean",
|
108 |
+
"higher_is_better": true
|
109 |
+
},
|
110 |
+
{
|
111 |
+
"metric": "acc_norm",
|
112 |
+
"aggregation": "mean",
|
113 |
+
"higher_is_better": true
|
114 |
+
}
|
115 |
+
],
|
116 |
+
"output_type": "multiple_choice",
|
117 |
+
"repeats": 1,
|
118 |
+
"should_decontaminate": true,
|
119 |
+
"doc_to_decontamination_query": "{{question}}"
|
120 |
+
},
|
121 |
+
"medmcqa_orig_filtered": {
|
122 |
+
"task": "medmcqa_orig_filtered",
|
123 |
+
"dataset_path": "AIM-Harvard/medmcqa_original",
|
124 |
+
"training_split": "train",
|
125 |
+
"validation_split": "validation",
|
126 |
+
"test_split": "validation",
|
127 |
+
"doc_to_text": "<function doc_to_text at 0x7f7aa1ac9120>",
|
128 |
+
"doc_to_target": "cop",
|
129 |
+
"doc_to_choice": [
|
130 |
+
"A",
|
131 |
+
"B",
|
132 |
+
"C",
|
133 |
+
"D"
|
134 |
+
],
|
135 |
+
"description": "",
|
136 |
+
"target_delimiter": " ",
|
137 |
+
"fewshot_delimiter": "\n\n",
|
138 |
+
"metric_list": [
|
139 |
+
{
|
140 |
+
"metric": "acc",
|
141 |
+
"aggregation": "mean",
|
142 |
+
"higher_is_better": true
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"metric": "acc_norm",
|
146 |
+
"aggregation": "mean",
|
147 |
+
"higher_is_better": true
|
148 |
+
}
|
149 |
+
],
|
150 |
+
"output_type": "multiple_choice",
|
151 |
+
"repeats": 1,
|
152 |
+
"should_decontaminate": true,
|
153 |
+
"doc_to_decontamination_query": "{{question}}"
|
154 |
+
},
|
155 |
+
"medqa_4options_g2b": {
|
156 |
+
"task": "medqa_4options_g2b",
|
157 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
|
158 |
+
"training_split": "train",
|
159 |
+
"validation_split": "validation",
|
160 |
+
"test_split": "test",
|
161 |
+
"doc_to_text": "<function doc_to_text at 0x7f7ab0d90700>",
|
162 |
+
"doc_to_target": "<function doc_to_target at 0x7f7ab0d90a60>",
|
163 |
+
"doc_to_choice": [
|
164 |
+
"A",
|
165 |
+
"B",
|
166 |
+
"C",
|
167 |
+
"D"
|
168 |
+
],
|
169 |
+
"description": "",
|
170 |
+
"target_delimiter": " ",
|
171 |
+
"fewshot_delimiter": "\n\n",
|
172 |
+
"metric_list": [
|
173 |
+
{
|
174 |
+
"metric": "acc",
|
175 |
+
"aggregation": "mean",
|
176 |
+
"higher_is_better": true
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"metric": "acc_norm",
|
180 |
+
"aggregation": "mean",
|
181 |
+
"higher_is_better": true
|
182 |
+
}
|
183 |
+
],
|
184 |
+
"output_type": "multiple_choice",
|
185 |
+
"repeats": 1,
|
186 |
+
"should_decontaminate": false
|
187 |
+
},
|
188 |
+
"medqa_4options_orig_filtered": {
|
189 |
+
"task": "medqa_4options_orig_filtered",
|
190 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
|
191 |
+
"training_split": "train",
|
192 |
+
"validation_split": "validation",
|
193 |
+
"test_split": "test",
|
194 |
+
"doc_to_text": "<function doc_to_text at 0x7f7ab289e560>",
|
195 |
+
"doc_to_target": "<function doc_to_target at 0x7f7ab0e6d000>",
|
196 |
+
"doc_to_choice": [
|
197 |
+
"A",
|
198 |
+
"B",
|
199 |
+
"C",
|
200 |
+
"D"
|
201 |
+
],
|
202 |
+
"description": "",
|
203 |
+
"target_delimiter": " ",
|
204 |
+
"fewshot_delimiter": "\n\n",
|
205 |
+
"metric_list": [
|
206 |
+
{
|
207 |
+
"metric": "acc",
|
208 |
+
"aggregation": "mean",
|
209 |
+
"higher_is_better": true
|
210 |
+
},
|
211 |
+
{
|
212 |
+
"metric": "acc_norm",
|
213 |
+
"aggregation": "mean",
|
214 |
+
"higher_is_better": true
|
215 |
+
}
|
216 |
+
],
|
217 |
+
"output_type": "multiple_choice",
|
218 |
+
"repeats": 1,
|
219 |
+
"should_decontaminate": false
|
220 |
+
}
|
221 |
+
},
|
222 |
+
"versions": {
|
223 |
+
"b4b": "N/A",
|
224 |
+
"b4bqa": "Yaml",
|
225 |
+
"medmcqa_g2b": "Yaml",
|
226 |
+
"medmcqa_orig_filtered": "Yaml",
|
227 |
+
"medqa_4options_g2b": "Yaml",
|
228 |
+
"medqa_4options_orig_filtered": "Yaml"
|
229 |
+
},
|
230 |
+
"n-shot": {
|
231 |
+
"b4b": 0,
|
232 |
+
"b4bqa": 0,
|
233 |
+
"medmcqa_g2b": 0,
|
234 |
+
"medmcqa_orig_filtered": 0,
|
235 |
+
"medqa_4options_g2b": 0,
|
236 |
+
"medqa_4options_orig_filtered": 0
|
237 |
+
},
|
238 |
+
"config": {
|
239 |
+
"model": "hf",
|
240 |
+
"model_args": "pretrained=01-ai/Yi-1.5-34B,parallelize=True,load_in_4bit=True",
|
241 |
+
"batch_size": "auto",
|
242 |
+
"batch_sizes": [
|
243 |
+
64
|
244 |
+
],
|
245 |
+
"device": null,
|
246 |
+
"use_cache": null,
|
247 |
+
"limit": null,
|
248 |
+
"bootstrap_iters": 100000,
|
249 |
+
"gen_kwargs": null
|
250 |
+
},
|
251 |
+
"git_hash": "928c7657"
|
252 |
+
}
|
data/raw-eval-outputs/CohereForAI-aya-23-35B_results.json
ADDED
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"b4b": {
|
4 |
+
"acc,none": 0.657213316892725,
|
5 |
+
"acc_stderr,none": 0.12271990860540663,
|
6 |
+
"acc_norm,none": 0.657213316892725,
|
7 |
+
"acc_norm_stderr,none": 0.12271990860540663,
|
8 |
+
"alias": "b4b"
|
9 |
+
},
|
10 |
+
"b4bqa": {
|
11 |
+
"acc,none": 0.7840401785714286,
|
12 |
+
"acc_stderr,none": 0.009723169269065642,
|
13 |
+
"acc_norm,none": 0.7840401785714286,
|
14 |
+
"acc_norm_stderr,none": 0.009723169269065642,
|
15 |
+
"alias": " - b4bqa"
|
16 |
+
},
|
17 |
+
"medmcqa_g2b": {
|
18 |
+
"acc,none": 0.48563218390804597,
|
19 |
+
"acc_stderr,none": 0.026830322100875627,
|
20 |
+
"acc_norm,none": 0.48563218390804597,
|
21 |
+
"acc_norm_stderr,none": 0.026830322100875627,
|
22 |
+
"alias": " - medmcqa_g2b"
|
23 |
+
},
|
24 |
+
"medmcqa_orig_filtered": {
|
25 |
+
"acc,none": 0.5287356321839081,
|
26 |
+
"acc_stderr,none": 0.026797041830104146,
|
27 |
+
"acc_norm,none": 0.5287356321839081,
|
28 |
+
"acc_norm_stderr,none": 0.026797041830104146,
|
29 |
+
"alias": " - medmcqa_orig_filtered"
|
30 |
+
},
|
31 |
+
"medqa_4options_g2b": {
|
32 |
+
"acc,none": 0.47883597883597884,
|
33 |
+
"acc_stderr,none": 0.025728230952130723,
|
34 |
+
"acc_norm,none": 0.47883597883597884,
|
35 |
+
"acc_norm_stderr,none": 0.025728230952130723,
|
36 |
+
"alias": " - medqa_4options_g2b"
|
37 |
+
},
|
38 |
+
"medqa_4options_orig_filtered": {
|
39 |
+
"acc,none": 0.5105820105820106,
|
40 |
+
"acc_stderr,none": 0.02574554227604548,
|
41 |
+
"acc_norm,none": 0.5105820105820106,
|
42 |
+
"acc_norm_stderr,none": 0.02574554227604548,
|
43 |
+
"alias": " - medqa_4options_orig_filtered"
|
44 |
+
}
|
45 |
+
},
|
46 |
+
"groups": {
|
47 |
+
"b4b": {
|
48 |
+
"acc,none": 0.657213316892725,
|
49 |
+
"acc_stderr,none": 0.12271990860540663,
|
50 |
+
"acc_norm,none": 0.657213316892725,
|
51 |
+
"acc_norm_stderr,none": 0.12271990860540663,
|
52 |
+
"alias": "b4b"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"configs": {
|
56 |
+
"b4bqa": {
|
57 |
+
"task": "b4bqa",
|
58 |
+
"dataset_path": "AIM-Harvard/b4b_drug_qa",
|
59 |
+
"test_split": "test",
|
60 |
+
"doc_to_text": "<function process_cd at 0x7f77e7a6d090>",
|
61 |
+
"doc_to_target": "correct_choice",
|
62 |
+
"doc_to_choice": [
|
63 |
+
"A",
|
64 |
+
"B",
|
65 |
+
"C",
|
66 |
+
"D"
|
67 |
+
],
|
68 |
+
"description": "",
|
69 |
+
"target_delimiter": " ",
|
70 |
+
"fewshot_delimiter": "\n\n",
|
71 |
+
"metric_list": [
|
72 |
+
{
|
73 |
+
"metric": "acc",
|
74 |
+
"aggregation": "mean",
|
75 |
+
"higher_is_better": true
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"metric": "acc_norm",
|
79 |
+
"aggregation": "mean",
|
80 |
+
"higher_is_better": true
|
81 |
+
}
|
82 |
+
],
|
83 |
+
"output_type": "multiple_choice",
|
84 |
+
"repeats": 1,
|
85 |
+
"should_decontaminate": false
|
86 |
+
},
|
87 |
+
"medmcqa_g2b": {
|
88 |
+
"task": "medmcqa_g2b",
|
89 |
+
"dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
|
90 |
+
"training_split": "train",
|
91 |
+
"validation_split": "validation",
|
92 |
+
"test_split": "validation",
|
93 |
+
"doc_to_text": "<function doc_to_text at 0x7f77e770c550>",
|
94 |
+
"doc_to_target": "cop",
|
95 |
+
"doc_to_choice": [
|
96 |
+
"A",
|
97 |
+
"B",
|
98 |
+
"C",
|
99 |
+
"D"
|
100 |
+
],
|
101 |
+
"description": "",
|
102 |
+
"target_delimiter": " ",
|
103 |
+
"fewshot_delimiter": "\n\n",
|
104 |
+
"metric_list": [
|
105 |
+
{
|
106 |
+
"metric": "acc",
|
107 |
+
"aggregation": "mean",
|
108 |
+
"higher_is_better": true
|
109 |
+
},
|
110 |
+
{
|
111 |
+
"metric": "acc_norm",
|
112 |
+
"aggregation": "mean",
|
113 |
+
"higher_is_better": true
|
114 |
+
}
|
115 |
+
],
|
116 |
+
"output_type": "multiple_choice",
|
117 |
+
"repeats": 1,
|
118 |
+
"should_decontaminate": true,
|
119 |
+
"doc_to_decontamination_query": "{{question}}"
|
120 |
+
},
|
121 |
+
"medmcqa_orig_filtered": {
|
122 |
+
"task": "medmcqa_orig_filtered",
|
123 |
+
"dataset_path": "AIM-Harvard/medmcqa_original",
|
124 |
+
"training_split": "train",
|
125 |
+
"validation_split": "validation",
|
126 |
+
"test_split": "validation",
|
127 |
+
"doc_to_text": "<function doc_to_text at 0x7f77e770c700>",
|
128 |
+
"doc_to_target": "cop",
|
129 |
+
"doc_to_choice": [
|
130 |
+
"A",
|
131 |
+
"B",
|
132 |
+
"C",
|
133 |
+
"D"
|
134 |
+
],
|
135 |
+
"description": "",
|
136 |
+
"target_delimiter": " ",
|
137 |
+
"fewshot_delimiter": "\n\n",
|
138 |
+
"metric_list": [
|
139 |
+
{
|
140 |
+
"metric": "acc",
|
141 |
+
"aggregation": "mean",
|
142 |
+
"higher_is_better": true
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"metric": "acc_norm",
|
146 |
+
"aggregation": "mean",
|
147 |
+
"higher_is_better": true
|
148 |
+
}
|
149 |
+
],
|
150 |
+
"output_type": "multiple_choice",
|
151 |
+
"repeats": 1,
|
152 |
+
"should_decontaminate": true,
|
153 |
+
"doc_to_decontamination_query": "{{question}}"
|
154 |
+
},
|
155 |
+
"medqa_4options_g2b": {
|
156 |
+
"task": "medqa_4options_g2b",
|
157 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
|
158 |
+
"training_split": "train",
|
159 |
+
"validation_split": "validation",
|
160 |
+
"test_split": "test",
|
161 |
+
"doc_to_text": "<function doc_to_text at 0x7f77e770f6d0>",
|
162 |
+
"doc_to_target": "<function doc_to_target at 0x7f77e770fa30>",
|
163 |
+
"doc_to_choice": [
|
164 |
+
"A",
|
165 |
+
"B",
|
166 |
+
"C",
|
167 |
+
"D"
|
168 |
+
],
|
169 |
+
"description": "",
|
170 |
+
"target_delimiter": " ",
|
171 |
+
"fewshot_delimiter": "\n\n",
|
172 |
+
"metric_list": [
|
173 |
+
{
|
174 |
+
"metric": "acc",
|
175 |
+
"aggregation": "mean",
|
176 |
+
"higher_is_better": true
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"metric": "acc_norm",
|
180 |
+
"aggregation": "mean",
|
181 |
+
"higher_is_better": true
|
182 |
+
}
|
183 |
+
],
|
184 |
+
"output_type": "multiple_choice",
|
185 |
+
"repeats": 1,
|
186 |
+
"should_decontaminate": false
|
187 |
+
},
|
188 |
+
"medqa_4options_orig_filtered": {
|
189 |
+
"task": "medqa_4options_orig_filtered",
|
190 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
|
191 |
+
"training_split": "train",
|
192 |
+
"validation_split": "validation",
|
193 |
+
"test_split": "test",
|
194 |
+
"doc_to_text": "<function doc_to_text at 0x7f77e770fc70>",
|
195 |
+
"doc_to_target": "<function doc_to_target at 0x7f77e770feb0>",
|
196 |
+
"doc_to_choice": [
|
197 |
+
"A",
|
198 |
+
"B",
|
199 |
+
"C",
|
200 |
+
"D"
|
201 |
+
],
|
202 |
+
"description": "",
|
203 |
+
"target_delimiter": " ",
|
204 |
+
"fewshot_delimiter": "\n\n",
|
205 |
+
"metric_list": [
|
206 |
+
{
|
207 |
+
"metric": "acc",
|
208 |
+
"aggregation": "mean",
|
209 |
+
"higher_is_better": true
|
210 |
+
},
|
211 |
+
{
|
212 |
+
"metric": "acc_norm",
|
213 |
+
"aggregation": "mean",
|
214 |
+
"higher_is_better": true
|
215 |
+
}
|
216 |
+
],
|
217 |
+
"output_type": "multiple_choice",
|
218 |
+
"repeats": 1,
|
219 |
+
"should_decontaminate": false
|
220 |
+
}
|
221 |
+
},
|
222 |
+
"versions": {
|
223 |
+
"b4b": "N/A",
|
224 |
+
"b4bqa": "Yaml",
|
225 |
+
"medmcqa_g2b": "Yaml",
|
226 |
+
"medmcqa_orig_filtered": "Yaml",
|
227 |
+
"medqa_4options_g2b": "Yaml",
|
228 |
+
"medqa_4options_orig_filtered": "Yaml"
|
229 |
+
},
|
230 |
+
"n-shot": {
|
231 |
+
"b4b": 0,
|
232 |
+
"b4bqa": 0,
|
233 |
+
"medmcqa_g2b": 0,
|
234 |
+
"medmcqa_orig_filtered": 0,
|
235 |
+
"medqa_4options_g2b": 0,
|
236 |
+
"medqa_4options_orig_filtered": 0
|
237 |
+
},
|
238 |
+
"config": {
|
239 |
+
"model": "hf",
|
240 |
+
"model_args": "pretrained=CohereForAI/aya-23-35B,load_in_4bit=True",
|
241 |
+
"batch_size": "4",
|
242 |
+
"batch_sizes": [],
|
243 |
+
"device": "cuda:0",
|
244 |
+
"use_cache": null,
|
245 |
+
"limit": null,
|
246 |
+
"bootstrap_iters": 100000,
|
247 |
+
"gen_kwargs": null
|
248 |
+
},
|
249 |
+
"git_hash": "928c7657"
|
250 |
+
}
|
data/raw-eval-outputs/CohereForAI-c4ai-command-r-plus_results.json
ADDED
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"b4b": {
|
4 |
+
"acc,none": 0.7241060419235512,
|
5 |
+
"acc_stderr,none": 0.12287593035527263,
|
6 |
+
"acc_norm,none": 0.7241060419235512,
|
7 |
+
"acc_norm_stderr,none": 0.12287593035527263,
|
8 |
+
"alias": "b4b"
|
9 |
+
},
|
10 |
+
"b4bqa": {
|
11 |
+
"acc,none": 0.8493303571428571,
|
12 |
+
"acc_stderr,none": 0.00845285482249418,
|
13 |
+
"acc_norm,none": 0.8493303571428571,
|
14 |
+
"acc_norm_stderr,none": 0.00845285482249418,
|
15 |
+
"alias": " - b4bqa"
|
16 |
+
},
|
17 |
+
"medmcqa_g2b": {
|
18 |
+
"acc,none": 0.49137931034482757,
|
19 |
+
"acc_stderr,none": 0.026837416550737143,
|
20 |
+
"acc_norm,none": 0.49137931034482757,
|
21 |
+
"acc_norm_stderr,none": 0.026837416550737143,
|
22 |
+
"alias": " - medmcqa_g2b"
|
23 |
+
},
|
24 |
+
"medmcqa_orig_filtered": {
|
25 |
+
"acc,none": 0.6149425287356322,
|
26 |
+
"acc_stderr,none": 0.026122534084516178,
|
27 |
+
"acc_norm,none": 0.6149425287356322,
|
28 |
+
"acc_norm_stderr,none": 0.026122534084516178,
|
29 |
+
"alias": " - medmcqa_orig_filtered"
|
30 |
+
},
|
31 |
+
"medqa_4options_g2b": {
|
32 |
+
"acc,none": 0.5661375661375662,
|
33 |
+
"acc_stderr,none": 0.0255250343824749,
|
34 |
+
"acc_norm,none": 0.5661375661375662,
|
35 |
+
"acc_norm_stderr,none": 0.0255250343824749,
|
36 |
+
"alias": " - medqa_4options_g2b"
|
37 |
+
},
|
38 |
+
"medqa_4options_orig_filtered": {
|
39 |
+
"acc,none": 0.6031746031746031,
|
40 |
+
"acc_stderr,none": 0.025197101074246483,
|
41 |
+
"acc_norm,none": 0.6031746031746031,
|
42 |
+
"acc_norm_stderr,none": 0.025197101074246483,
|
43 |
+
"alias": " - medqa_4options_orig_filtered"
|
44 |
+
}
|
45 |
+
},
|
46 |
+
"groups": {
|
47 |
+
"b4b": {
|
48 |
+
"acc,none": 0.7241060419235512,
|
49 |
+
"acc_stderr,none": 0.12287593035527263,
|
50 |
+
"acc_norm,none": 0.7241060419235512,
|
51 |
+
"acc_norm_stderr,none": 0.12287593035527263,
|
52 |
+
"alias": "b4b"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"configs": {
|
56 |
+
"b4bqa": {
|
57 |
+
"task": "b4bqa",
|
58 |
+
"dataset_path": "AIM-Harvard/b4b_drug_qa",
|
59 |
+
"test_split": "test",
|
60 |
+
"doc_to_text": "<function process_cd at 0x7f6d9dc51090>",
|
61 |
+
"doc_to_target": "correct_choice",
|
62 |
+
"doc_to_choice": [
|
63 |
+
"A",
|
64 |
+
"B",
|
65 |
+
"C",
|
66 |
+
"D"
|
67 |
+
],
|
68 |
+
"description": "",
|
69 |
+
"target_delimiter": " ",
|
70 |
+
"fewshot_delimiter": "\n\n",
|
71 |
+
"metric_list": [
|
72 |
+
{
|
73 |
+
"metric": "acc",
|
74 |
+
"aggregation": "mean",
|
75 |
+
"higher_is_better": true
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"metric": "acc_norm",
|
79 |
+
"aggregation": "mean",
|
80 |
+
"higher_is_better": true
|
81 |
+
}
|
82 |
+
],
|
83 |
+
"output_type": "multiple_choice",
|
84 |
+
"repeats": 1,
|
85 |
+
"should_decontaminate": false
|
86 |
+
},
|
87 |
+
"medmcqa_g2b": {
|
88 |
+
"task": "medmcqa_g2b",
|
89 |
+
"dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
|
90 |
+
"training_split": "train",
|
91 |
+
"validation_split": "validation",
|
92 |
+
"test_split": "validation",
|
93 |
+
"doc_to_text": "<function doc_to_text at 0x7f6d9d85c550>",
|
94 |
+
"doc_to_target": "cop",
|
95 |
+
"doc_to_choice": [
|
96 |
+
"A",
|
97 |
+
"B",
|
98 |
+
"C",
|
99 |
+
"D"
|
100 |
+
],
|
101 |
+
"description": "",
|
102 |
+
"target_delimiter": " ",
|
103 |
+
"fewshot_delimiter": "\n\n",
|
104 |
+
"metric_list": [
|
105 |
+
{
|
106 |
+
"metric": "acc",
|
107 |
+
"aggregation": "mean",
|
108 |
+
"higher_is_better": true
|
109 |
+
},
|
110 |
+
{
|
111 |
+
"metric": "acc_norm",
|
112 |
+
"aggregation": "mean",
|
113 |
+
"higher_is_better": true
|
114 |
+
}
|
115 |
+
],
|
116 |
+
"output_type": "multiple_choice",
|
117 |
+
"repeats": 1,
|
118 |
+
"should_decontaminate": true,
|
119 |
+
"doc_to_decontamination_query": "{{question}}"
|
120 |
+
},
|
121 |
+
"medmcqa_orig_filtered": {
|
122 |
+
"task": "medmcqa_orig_filtered",
|
123 |
+
"dataset_path": "AIM-Harvard/medmcqa_original",
|
124 |
+
"training_split": "train",
|
125 |
+
"validation_split": "validation",
|
126 |
+
"test_split": "validation",
|
127 |
+
"doc_to_text": "<function doc_to_text at 0x7f6d9d85c700>",
|
128 |
+
"doc_to_target": "cop",
|
129 |
+
"doc_to_choice": [
|
130 |
+
"A",
|
131 |
+
"B",
|
132 |
+
"C",
|
133 |
+
"D"
|
134 |
+
],
|
135 |
+
"description": "",
|
136 |
+
"target_delimiter": " ",
|
137 |
+
"fewshot_delimiter": "\n\n",
|
138 |
+
"metric_list": [
|
139 |
+
{
|
140 |
+
"metric": "acc",
|
141 |
+
"aggregation": "mean",
|
142 |
+
"higher_is_better": true
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"metric": "acc_norm",
|
146 |
+
"aggregation": "mean",
|
147 |
+
"higher_is_better": true
|
148 |
+
}
|
149 |
+
],
|
150 |
+
"output_type": "multiple_choice",
|
151 |
+
"repeats": 1,
|
152 |
+
"should_decontaminate": true,
|
153 |
+
"doc_to_decontamination_query": "{{question}}"
|
154 |
+
},
|
155 |
+
"medqa_4options_g2b": {
|
156 |
+
"task": "medqa_4options_g2b",
|
157 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
|
158 |
+
"training_split": "train",
|
159 |
+
"validation_split": "validation",
|
160 |
+
"test_split": "test",
|
161 |
+
"doc_to_text": "<function doc_to_text at 0x7f6d9d85f6d0>",
|
162 |
+
"doc_to_target": "<function doc_to_target at 0x7f6d9d85fa30>",
|
163 |
+
"doc_to_choice": [
|
164 |
+
"A",
|
165 |
+
"B",
|
166 |
+
"C",
|
167 |
+
"D"
|
168 |
+
],
|
169 |
+
"description": "",
|
170 |
+
"target_delimiter": " ",
|
171 |
+
"fewshot_delimiter": "\n\n",
|
172 |
+
"metric_list": [
|
173 |
+
{
|
174 |
+
"metric": "acc",
|
175 |
+
"aggregation": "mean",
|
176 |
+
"higher_is_better": true
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"metric": "acc_norm",
|
180 |
+
"aggregation": "mean",
|
181 |
+
"higher_is_better": true
|
182 |
+
}
|
183 |
+
],
|
184 |
+
"output_type": "multiple_choice",
|
185 |
+
"repeats": 1,
|
186 |
+
"should_decontaminate": false
|
187 |
+
},
|
188 |
+
"medqa_4options_orig_filtered": {
|
189 |
+
"task": "medqa_4options_orig_filtered",
|
190 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
|
191 |
+
"training_split": "train",
|
192 |
+
"validation_split": "validation",
|
193 |
+
"test_split": "test",
|
194 |
+
"doc_to_text": "<function doc_to_text at 0x7f6d9d85fc70>",
|
195 |
+
"doc_to_target": "<function doc_to_target at 0x7f6d9d85feb0>",
|
196 |
+
"doc_to_choice": [
|
197 |
+
"A",
|
198 |
+
"B",
|
199 |
+
"C",
|
200 |
+
"D"
|
201 |
+
],
|
202 |
+
"description": "",
|
203 |
+
"target_delimiter": " ",
|
204 |
+
"fewshot_delimiter": "\n\n",
|
205 |
+
"metric_list": [
|
206 |
+
{
|
207 |
+
"metric": "acc",
|
208 |
+
"aggregation": "mean",
|
209 |
+
"higher_is_better": true
|
210 |
+
},
|
211 |
+
{
|
212 |
+
"metric": "acc_norm",
|
213 |
+
"aggregation": "mean",
|
214 |
+
"higher_is_better": true
|
215 |
+
}
|
216 |
+
],
|
217 |
+
"output_type": "multiple_choice",
|
218 |
+
"repeats": 1,
|
219 |
+
"should_decontaminate": false
|
220 |
+
}
|
221 |
+
},
|
222 |
+
"versions": {
|
223 |
+
"b4b": "N/A",
|
224 |
+
"b4bqa": "Yaml",
|
225 |
+
"medmcqa_g2b": "Yaml",
|
226 |
+
"medmcqa_orig_filtered": "Yaml",
|
227 |
+
"medqa_4options_g2b": "Yaml",
|
228 |
+
"medqa_4options_orig_filtered": "Yaml"
|
229 |
+
},
|
230 |
+
"n-shot": {
|
231 |
+
"b4b": 0,
|
232 |
+
"b4bqa": 0,
|
233 |
+
"medmcqa_g2b": 0,
|
234 |
+
"medmcqa_orig_filtered": 0,
|
235 |
+
"medqa_4options_g2b": 0,
|
236 |
+
"medqa_4options_orig_filtered": 0
|
237 |
+
},
|
238 |
+
"config": {
|
239 |
+
"model": "hf",
|
240 |
+
"model_args": "pretrained=CohereForAI/c4ai-command-r-plus,load_in_4bit=True",
|
241 |
+
"batch_size": "4",
|
242 |
+
"batch_sizes": [],
|
243 |
+
"device": "cuda:0",
|
244 |
+
"use_cache": null,
|
245 |
+
"limit": null,
|
246 |
+
"bootstrap_iters": 100000,
|
247 |
+
"gen_kwargs": null
|
248 |
+
},
|
249 |
+
"git_hash": "928c7657"
|
250 |
+
}
|
data/raw-eval-outputs/ProbeMedicalYonseiMAILab-medllama3-v20_results.json
ADDED
@@ -0,0 +1,252 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"b4b": {
|
4 |
+
"acc,none": 0.7475339087546239,
|
5 |
+
"acc_stderr,none": 0.0611860272880456,
|
6 |
+
"acc_norm,none": 0.7475339087546239,
|
7 |
+
"acc_norm_stderr,none": 0.0611860272880456,
|
8 |
+
"alias": "b4b"
|
9 |
+
},
|
10 |
+
"b4bqa": {
|
11 |
+
"acc,none": 0.7193080357142857,
|
12 |
+
"acc_stderr,none": 0.01061755826614456,
|
13 |
+
"acc_norm,none": 0.7193080357142857,
|
14 |
+
"acc_norm_stderr,none": 0.01061755826614456,
|
15 |
+
"alias": " - b4bqa"
|
16 |
+
},
|
17 |
+
"medmcqa_g2b": {
|
18 |
+
"acc,none": 0.6522988505747126,
|
19 |
+
"acc_stderr,none": 0.025565932174194388,
|
20 |
+
"acc_norm,none": 0.6522988505747126,
|
21 |
+
"acc_norm_stderr,none": 0.025565932174194388,
|
22 |
+
"alias": " - medmcqa_g2b"
|
23 |
+
},
|
24 |
+
"medmcqa_orig_filtered": {
|
25 |
+
"acc,none": 0.8017241379310345,
|
26 |
+
"acc_stderr,none": 0.021403394960161685,
|
27 |
+
"acc_norm,none": 0.8017241379310345,
|
28 |
+
"acc_norm_stderr,none": 0.021403394960161685,
|
29 |
+
"alias": " - medmcqa_orig_filtered"
|
30 |
+
},
|
31 |
+
"medqa_4options_g2b": {
|
32 |
+
"acc,none": 0.7645502645502645,
|
33 |
+
"acc_stderr,none": 0.021851509822031715,
|
34 |
+
"acc_norm,none": 0.7645502645502645,
|
35 |
+
"acc_norm_stderr,none": 0.021851509822031715,
|
36 |
+
"alias": " - medqa_4options_g2b"
|
37 |
+
},
|
38 |
+
"medqa_4options_orig_filtered": {
|
39 |
+
"acc,none": 0.9021164021164021,
|
40 |
+
"acc_stderr,none": 0.015304374225091422,
|
41 |
+
"acc_norm,none": 0.9021164021164021,
|
42 |
+
"acc_norm_stderr,none": 0.015304374225091422,
|
43 |
+
"alias": " - medqa_4options_orig_filtered"
|
44 |
+
}
|
45 |
+
},
|
46 |
+
"groups": {
|
47 |
+
"b4b": {
|
48 |
+
"acc,none": 0.7475339087546239,
|
49 |
+
"acc_stderr,none": 0.0611860272880456,
|
50 |
+
"acc_norm,none": 0.7475339087546239,
|
51 |
+
"acc_norm_stderr,none": 0.0611860272880456,
|
52 |
+
"alias": "b4b"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"configs": {
|
56 |
+
"b4bqa": {
|
57 |
+
"task": "b4bqa",
|
58 |
+
"dataset_path": "AIM-Harvard/b4b_drug_qa",
|
59 |
+
"test_split": "test",
|
60 |
+
"doc_to_text": "<function process_cd at 0x7f59e4b48820>",
|
61 |
+
"doc_to_target": "correct_choice",
|
62 |
+
"doc_to_choice": [
|
63 |
+
"A",
|
64 |
+
"B",
|
65 |
+
"C",
|
66 |
+
"D"
|
67 |
+
],
|
68 |
+
"description": "",
|
69 |
+
"target_delimiter": " ",
|
70 |
+
"fewshot_delimiter": "\n\n",
|
71 |
+
"metric_list": [
|
72 |
+
{
|
73 |
+
"metric": "acc",
|
74 |
+
"aggregation": "mean",
|
75 |
+
"higher_is_better": true
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"metric": "acc_norm",
|
79 |
+
"aggregation": "mean",
|
80 |
+
"higher_is_better": true
|
81 |
+
}
|
82 |
+
],
|
83 |
+
"output_type": "multiple_choice",
|
84 |
+
"repeats": 1,
|
85 |
+
"should_decontaminate": false
|
86 |
+
},
|
87 |
+
"medmcqa_g2b": {
|
88 |
+
"task": "medmcqa_g2b",
|
89 |
+
"dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
|
90 |
+
"training_split": "train",
|
91 |
+
"validation_split": "validation",
|
92 |
+
"test_split": "validation",
|
93 |
+
"doc_to_text": "<function doc_to_text at 0x7f59e4b92050>",
|
94 |
+
"doc_to_target": "cop",
|
95 |
+
"doc_to_choice": [
|
96 |
+
"A",
|
97 |
+
"B",
|
98 |
+
"C",
|
99 |
+
"D"
|
100 |
+
],
|
101 |
+
"description": "",
|
102 |
+
"target_delimiter": " ",
|
103 |
+
"fewshot_delimiter": "\n\n",
|
104 |
+
"metric_list": [
|
105 |
+
{
|
106 |
+
"metric": "acc",
|
107 |
+
"aggregation": "mean",
|
108 |
+
"higher_is_better": true
|
109 |
+
},
|
110 |
+
{
|
111 |
+
"metric": "acc_norm",
|
112 |
+
"aggregation": "mean",
|
113 |
+
"higher_is_better": true
|
114 |
+
}
|
115 |
+
],
|
116 |
+
"output_type": "multiple_choice",
|
117 |
+
"repeats": 1,
|
118 |
+
"should_decontaminate": true,
|
119 |
+
"doc_to_decontamination_query": "{{question}}"
|
120 |
+
},
|
121 |
+
"medmcqa_orig_filtered": {
|
122 |
+
"task": "medmcqa_orig_filtered",
|
123 |
+
"dataset_path": "AIM-Harvard/medmcqa_original",
|
124 |
+
"training_split": "train",
|
125 |
+
"validation_split": "validation",
|
126 |
+
"test_split": "validation",
|
127 |
+
"doc_to_text": "<function doc_to_text at 0x7f59d579d240>",
|
128 |
+
"doc_to_target": "cop",
|
129 |
+
"doc_to_choice": [
|
130 |
+
"A",
|
131 |
+
"B",
|
132 |
+
"C",
|
133 |
+
"D"
|
134 |
+
],
|
135 |
+
"description": "",
|
136 |
+
"target_delimiter": " ",
|
137 |
+
"fewshot_delimiter": "\n\n",
|
138 |
+
"metric_list": [
|
139 |
+
{
|
140 |
+
"metric": "acc",
|
141 |
+
"aggregation": "mean",
|
142 |
+
"higher_is_better": true
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"metric": "acc_norm",
|
146 |
+
"aggregation": "mean",
|
147 |
+
"higher_is_better": true
|
148 |
+
}
|
149 |
+
],
|
150 |
+
"output_type": "multiple_choice",
|
151 |
+
"repeats": 1,
|
152 |
+
"should_decontaminate": true,
|
153 |
+
"doc_to_decontamination_query": "{{question}}"
|
154 |
+
},
|
155 |
+
"medqa_4options_g2b": {
|
156 |
+
"task": "medqa_4options_g2b",
|
157 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
|
158 |
+
"training_split": "train",
|
159 |
+
"validation_split": "validation",
|
160 |
+
"test_split": "test",
|
161 |
+
"doc_to_text": "<function doc_to_text at 0x7f59e4a54820>",
|
162 |
+
"doc_to_target": "<function doc_to_target at 0x7f59e4a54b80>",
|
163 |
+
"doc_to_choice": [
|
164 |
+
"A",
|
165 |
+
"B",
|
166 |
+
"C",
|
167 |
+
"D"
|
168 |
+
],
|
169 |
+
"description": "",
|
170 |
+
"target_delimiter": " ",
|
171 |
+
"fewshot_delimiter": "\n\n",
|
172 |
+
"metric_list": [
|
173 |
+
{
|
174 |
+
"metric": "acc",
|
175 |
+
"aggregation": "mean",
|
176 |
+
"higher_is_better": true
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"metric": "acc_norm",
|
180 |
+
"aggregation": "mean",
|
181 |
+
"higher_is_better": true
|
182 |
+
}
|
183 |
+
],
|
184 |
+
"output_type": "multiple_choice",
|
185 |
+
"repeats": 1,
|
186 |
+
"should_decontaminate": false
|
187 |
+
},
|
188 |
+
"medqa_4options_orig_filtered": {
|
189 |
+
"task": "medqa_4options_orig_filtered",
|
190 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
|
191 |
+
"training_split": "train",
|
192 |
+
"validation_split": "validation",
|
193 |
+
"test_split": "test",
|
194 |
+
"doc_to_text": "<function doc_to_text at 0x7f59e4b0e680>",
|
195 |
+
"doc_to_target": "<function doc_to_target at 0x7f59e4b2d120>",
|
196 |
+
"doc_to_choice": [
|
197 |
+
"A",
|
198 |
+
"B",
|
199 |
+
"C",
|
200 |
+
"D"
|
201 |
+
],
|
202 |
+
"description": "",
|
203 |
+
"target_delimiter": " ",
|
204 |
+
"fewshot_delimiter": "\n\n",
|
205 |
+
"metric_list": [
|
206 |
+
{
|
207 |
+
"metric": "acc",
|
208 |
+
"aggregation": "mean",
|
209 |
+
"higher_is_better": true
|
210 |
+
},
|
211 |
+
{
|
212 |
+
"metric": "acc_norm",
|
213 |
+
"aggregation": "mean",
|
214 |
+
"higher_is_better": true
|
215 |
+
}
|
216 |
+
],
|
217 |
+
"output_type": "multiple_choice",
|
218 |
+
"repeats": 1,
|
219 |
+
"should_decontaminate": false
|
220 |
+
}
|
221 |
+
},
|
222 |
+
"versions": {
|
223 |
+
"b4b": "N/A",
|
224 |
+
"b4bqa": "Yaml",
|
225 |
+
"medmcqa_g2b": "Yaml",
|
226 |
+
"medmcqa_orig_filtered": "Yaml",
|
227 |
+
"medqa_4options_g2b": "Yaml",
|
228 |
+
"medqa_4options_orig_filtered": "Yaml"
|
229 |
+
},
|
230 |
+
"n-shot": {
|
231 |
+
"b4b": 0,
|
232 |
+
"b4bqa": 0,
|
233 |
+
"medmcqa_g2b": 0,
|
234 |
+
"medmcqa_orig_filtered": 0,
|
235 |
+
"medqa_4options_g2b": 0,
|
236 |
+
"medqa_4options_orig_filtered": 0
|
237 |
+
},
|
238 |
+
"config": {
|
239 |
+
"model": "hf",
|
240 |
+
"model_args": "pretrained=ProbeMedicalYonseiMAILab/medllama3-v20,parallelize=True,load_in_4bit=True",
|
241 |
+
"batch_size": "auto",
|
242 |
+
"batch_sizes": [
|
243 |
+
32
|
244 |
+
],
|
245 |
+
"device": null,
|
246 |
+
"use_cache": null,
|
247 |
+
"limit": null,
|
248 |
+
"bootstrap_iters": 100000,
|
249 |
+
"gen_kwargs": null
|
250 |
+
},
|
251 |
+
"git_hash": "928c7657"
|
252 |
+
}
|
data/raw-eval-outputs/Qwen-Qwen2-72B_results.json
ADDED
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"b4b": {
|
4 |
+
"acc,none": 0.8372379778051788,
|
5 |
+
"acc_stderr,none": 0.07216098703042964,
|
6 |
+
"acc_norm,none": 0.8372379778051788,
|
7 |
+
"acc_norm_stderr,none": 0.07216098703042964,
|
8 |
+
"alias": "b4b"
|
9 |
+
},
|
10 |
+
"b4bqa": {
|
11 |
+
"acc,none": 0.91015625,
|
12 |
+
"acc_stderr,none": 0.006757003132881115,
|
13 |
+
"acc_norm,none": 0.91015625,
|
14 |
+
"acc_norm_stderr,none": 0.006757003132881115,
|
15 |
+
"alias": " - b4bqa"
|
16 |
+
},
|
17 |
+
"medmcqa_g2b": {
|
18 |
+
"acc,none": 0.7155172413793104,
|
19 |
+
"acc_stderr,none": 0.024219952635630794,
|
20 |
+
"acc_norm,none": 0.7155172413793104,
|
21 |
+
"acc_norm_stderr,none": 0.024219952635630794,
|
22 |
+
"alias": " - medmcqa_g2b"
|
23 |
+
},
|
24 |
+
"medmcqa_orig_filtered": {
|
25 |
+
"acc,none": 0.7787356321839081,
|
26 |
+
"acc_stderr,none": 0.02228363451068677,
|
27 |
+
"acc_norm,none": 0.7787356321839081,
|
28 |
+
"acc_norm_stderr,none": 0.02228363451068677,
|
29 |
+
"alias": " - medmcqa_orig_filtered"
|
30 |
+
},
|
31 |
+
"medqa_4options_g2b": {
|
32 |
+
"acc,none": 0.7407407407407407,
|
33 |
+
"acc_stderr,none": 0.022569897074918417,
|
34 |
+
"acc_norm,none": 0.7407407407407407,
|
35 |
+
"acc_norm_stderr,none": 0.022569897074918417,
|
36 |
+
"alias": " - medqa_4options_g2b"
|
37 |
+
},
|
38 |
+
"medqa_4options_orig_filtered": {
|
39 |
+
"acc,none": 0.753968253968254,
|
40 |
+
"acc_stderr,none": 0.022182037202948368,
|
41 |
+
"acc_norm,none": 0.753968253968254,
|
42 |
+
"acc_norm_stderr,none": 0.022182037202948368,
|
43 |
+
"alias": " - medqa_4options_orig_filtered"
|
44 |
+
}
|
45 |
+
},
|
46 |
+
"groups": {
|
47 |
+
"b4b": {
|
48 |
+
"acc,none": 0.8372379778051788,
|
49 |
+
"acc_stderr,none": 0.07216098703042964,
|
50 |
+
"acc_norm,none": 0.8372379778051788,
|
51 |
+
"acc_norm_stderr,none": 0.07216098703042964,
|
52 |
+
"alias": "b4b"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"configs": {
|
56 |
+
"b4bqa": {
|
57 |
+
"task": "b4bqa",
|
58 |
+
"dataset_path": "AIM-Harvard/b4b_drug_qa",
|
59 |
+
"test_split": "test",
|
60 |
+
"doc_to_text": "<function process_cd at 0x7fe2a537cf70>",
|
61 |
+
"doc_to_target": "correct_choice",
|
62 |
+
"doc_to_choice": [
|
63 |
+
"A",
|
64 |
+
"B",
|
65 |
+
"C",
|
66 |
+
"D"
|
67 |
+
],
|
68 |
+
"description": "",
|
69 |
+
"target_delimiter": " ",
|
70 |
+
"fewshot_delimiter": "\n\n",
|
71 |
+
"metric_list": [
|
72 |
+
{
|
73 |
+
"metric": "acc",
|
74 |
+
"aggregation": "mean",
|
75 |
+
"higher_is_better": true
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"metric": "acc_norm",
|
79 |
+
"aggregation": "mean",
|
80 |
+
"higher_is_better": true
|
81 |
+
}
|
82 |
+
],
|
83 |
+
"output_type": "multiple_choice",
|
84 |
+
"repeats": 1,
|
85 |
+
"should_decontaminate": false
|
86 |
+
},
|
87 |
+
"medmcqa_g2b": {
|
88 |
+
"task": "medmcqa_g2b",
|
89 |
+
"dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
|
90 |
+
"training_split": "train",
|
91 |
+
"validation_split": "validation",
|
92 |
+
"test_split": "validation",
|
93 |
+
"doc_to_text": "<function doc_to_text at 0x7fe2a4fa0430>",
|
94 |
+
"doc_to_target": "cop",
|
95 |
+
"doc_to_choice": [
|
96 |
+
"A",
|
97 |
+
"B",
|
98 |
+
"C",
|
99 |
+
"D"
|
100 |
+
],
|
101 |
+
"description": "",
|
102 |
+
"target_delimiter": " ",
|
103 |
+
"fewshot_delimiter": "\n\n",
|
104 |
+
"metric_list": [
|
105 |
+
{
|
106 |
+
"metric": "acc",
|
107 |
+
"aggregation": "mean",
|
108 |
+
"higher_is_better": true
|
109 |
+
},
|
110 |
+
{
|
111 |
+
"metric": "acc_norm",
|
112 |
+
"aggregation": "mean",
|
113 |
+
"higher_is_better": true
|
114 |
+
}
|
115 |
+
],
|
116 |
+
"output_type": "multiple_choice",
|
117 |
+
"repeats": 1,
|
118 |
+
"should_decontaminate": true,
|
119 |
+
"doc_to_decontamination_query": "{{question}}"
|
120 |
+
},
|
121 |
+
"medmcqa_orig_filtered": {
|
122 |
+
"task": "medmcqa_orig_filtered",
|
123 |
+
"dataset_path": "AIM-Harvard/medmcqa_original",
|
124 |
+
"training_split": "train",
|
125 |
+
"validation_split": "validation",
|
126 |
+
"test_split": "validation",
|
127 |
+
"doc_to_text": "<function doc_to_text at 0x7fe2a4fa05e0>",
|
128 |
+
"doc_to_target": "cop",
|
129 |
+
"doc_to_choice": [
|
130 |
+
"A",
|
131 |
+
"B",
|
132 |
+
"C",
|
133 |
+
"D"
|
134 |
+
],
|
135 |
+
"description": "",
|
136 |
+
"target_delimiter": " ",
|
137 |
+
"fewshot_delimiter": "\n\n",
|
138 |
+
"metric_list": [
|
139 |
+
{
|
140 |
+
"metric": "acc",
|
141 |
+
"aggregation": "mean",
|
142 |
+
"higher_is_better": true
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"metric": "acc_norm",
|
146 |
+
"aggregation": "mean",
|
147 |
+
"higher_is_better": true
|
148 |
+
}
|
149 |
+
],
|
150 |
+
"output_type": "multiple_choice",
|
151 |
+
"repeats": 1,
|
152 |
+
"should_decontaminate": true,
|
153 |
+
"doc_to_decontamination_query": "{{question}}"
|
154 |
+
},
|
155 |
+
"medqa_4options_g2b": {
|
156 |
+
"task": "medqa_4options_g2b",
|
157 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
|
158 |
+
"training_split": "train",
|
159 |
+
"validation_split": "validation",
|
160 |
+
"test_split": "test",
|
161 |
+
"doc_to_text": "<function doc_to_text at 0x7fe2a4fa35b0>",
|
162 |
+
"doc_to_target": "<function doc_to_target at 0x7fe2a4fa3910>",
|
163 |
+
"doc_to_choice": [
|
164 |
+
"A",
|
165 |
+
"B",
|
166 |
+
"C",
|
167 |
+
"D"
|
168 |
+
],
|
169 |
+
"description": "",
|
170 |
+
"target_delimiter": " ",
|
171 |
+
"fewshot_delimiter": "\n\n",
|
172 |
+
"metric_list": [
|
173 |
+
{
|
174 |
+
"metric": "acc",
|
175 |
+
"aggregation": "mean",
|
176 |
+
"higher_is_better": true
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"metric": "acc_norm",
|
180 |
+
"aggregation": "mean",
|
181 |
+
"higher_is_better": true
|
182 |
+
}
|
183 |
+
],
|
184 |
+
"output_type": "multiple_choice",
|
185 |
+
"repeats": 1,
|
186 |
+
"should_decontaminate": false
|
187 |
+
},
|
188 |
+
"medqa_4options_orig_filtered": {
|
189 |
+
"task": "medqa_4options_orig_filtered",
|
190 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
|
191 |
+
"training_split": "train",
|
192 |
+
"validation_split": "validation",
|
193 |
+
"test_split": "test",
|
194 |
+
"doc_to_text": "<function doc_to_text at 0x7fe2a4fa3b50>",
|
195 |
+
"doc_to_target": "<function doc_to_target at 0x7fe2a4fa3d90>",
|
196 |
+
"doc_to_choice": [
|
197 |
+
"A",
|
198 |
+
"B",
|
199 |
+
"C",
|
200 |
+
"D"
|
201 |
+
],
|
202 |
+
"description": "",
|
203 |
+
"target_delimiter": " ",
|
204 |
+
"fewshot_delimiter": "\n\n",
|
205 |
+
"metric_list": [
|
206 |
+
{
|
207 |
+
"metric": "acc",
|
208 |
+
"aggregation": "mean",
|
209 |
+
"higher_is_better": true
|
210 |
+
},
|
211 |
+
{
|
212 |
+
"metric": "acc_norm",
|
213 |
+
"aggregation": "mean",
|
214 |
+
"higher_is_better": true
|
215 |
+
}
|
216 |
+
],
|
217 |
+
"output_type": "multiple_choice",
|
218 |
+
"repeats": 1,
|
219 |
+
"should_decontaminate": false
|
220 |
+
}
|
221 |
+
},
|
222 |
+
"versions": {
|
223 |
+
"b4b": "N/A",
|
224 |
+
"b4bqa": "Yaml",
|
225 |
+
"medmcqa_g2b": "Yaml",
|
226 |
+
"medmcqa_orig_filtered": "Yaml",
|
227 |
+
"medqa_4options_g2b": "Yaml",
|
228 |
+
"medqa_4options_orig_filtered": "Yaml"
|
229 |
+
},
|
230 |
+
"n-shot": {
|
231 |
+
"b4b": 0,
|
232 |
+
"b4bqa": 0,
|
233 |
+
"medmcqa_g2b": 0,
|
234 |
+
"medmcqa_orig_filtered": 0,
|
235 |
+
"medqa_4options_g2b": 0,
|
236 |
+
"medqa_4options_orig_filtered": 0
|
237 |
+
},
|
238 |
+
"config": {
|
239 |
+
"model": "hf",
|
240 |
+
"model_args": "pretrained=Qwen/Qwen2-72B,load_in_4bit=True",
|
241 |
+
"batch_size": "4",
|
242 |
+
"batch_sizes": [],
|
243 |
+
"device": "cuda:0",
|
244 |
+
"use_cache": null,
|
245 |
+
"limit": null,
|
246 |
+
"bootstrap_iters": 100000,
|
247 |
+
"gen_kwargs": null
|
248 |
+
},
|
249 |
+
"git_hash": "928c7657"
|
250 |
+
}
|
data/raw-eval-outputs/Qwen-Qwen2-7B_results.json
ADDED
@@ -0,0 +1,316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"b4b": {
|
4 |
+
"acc,none": 0.7028360049321827,
|
5 |
+
"acc_stderr,none": 0.1004832322485701,
|
6 |
+
"acc_norm,none": 0.7028360049321827,
|
7 |
+
"acc_norm_stderr,none": 0.1004832322485701,
|
8 |
+
"alias": "b4b"
|
9 |
+
},
|
10 |
+
"b4bqa": {
|
11 |
+
"acc,none": 0.8041294642857143,
|
12 |
+
"acc_stderr,none": 0.009377773744245437,
|
13 |
+
"acc_norm,none": 0.8041294642857143,
|
14 |
+
"acc_norm_stderr,none": 0.009377773744245437,
|
15 |
+
"alias": " - b4bqa"
|
16 |
+
},
|
17 |
+
"medmcqa_g2b": {
|
18 |
+
"acc,none": 0.5517241379310345,
|
19 |
+
"acc_stderr,none": 0.02669739777037782,
|
20 |
+
"acc_norm,none": 0.5517241379310345,
|
21 |
+
"acc_norm_stderr,none": 0.02669739777037782,
|
22 |
+
"alias": " - medmcqa_g2b"
|
23 |
+
},
|
24 |
+
"medmcqa_orig_filtered": {
|
25 |
+
"acc,none": 0.6350574712643678,
|
26 |
+
"acc_stderr,none": 0.025843659831273274,
|
27 |
+
"acc_norm,none": 0.6350574712643678,
|
28 |
+
"acc_norm_stderr,none": 0.025843659831273274,
|
29 |
+
"alias": " - medmcqa_orig_filtered"
|
30 |
+
},
|
31 |
+
"medqa_4options_g2b": {
|
32 |
+
"acc,none": 0.5370370370370371,
|
33 |
+
"acc_stderr,none": 0.025680564640056882,
|
34 |
+
"acc_norm,none": 0.5370370370370371,
|
35 |
+
"acc_norm_stderr,none": 0.025680564640056882,
|
36 |
+
"alias": " - medqa_4options_g2b"
|
37 |
+
},
|
38 |
+
"medqa_4options_orig_filtered": {
|
39 |
+
"acc,none": 0.58994708994709,
|
40 |
+
"acc_stderr,none": 0.025331202438944444,
|
41 |
+
"acc_norm,none": 0.58994708994709,
|
42 |
+
"acc_norm_stderr,none": 0.025331202438944444,
|
43 |
+
"alias": " - medqa_4options_orig_filtered"
|
44 |
+
}
|
45 |
+
},
|
46 |
+
"groups": {
|
47 |
+
"b4b": {
|
48 |
+
"acc,none": 0.7028360049321827,
|
49 |
+
"acc_stderr,none": 0.1004832322485701,
|
50 |
+
"acc_norm,none": 0.7028360049321827,
|
51 |
+
"acc_norm_stderr,none": 0.1004832322485701,
|
52 |
+
"alias": "b4b"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"configs": {
|
56 |
+
"b4bqa": {
|
57 |
+
"task": "b4bqa",
|
58 |
+
"dataset_path": "AIM-Harvard/b4b_drug_qa",
|
59 |
+
"test_split": "test",
|
60 |
+
"doc_to_text": "<function process_cd at 0x7f8319c60ee0>",
|
61 |
+
"doc_to_target": "correct_choice",
|
62 |
+
"doc_to_choice": [
|
63 |
+
"A",
|
64 |
+
"B",
|
65 |
+
"C",
|
66 |
+
"D"
|
67 |
+
],
|
68 |
+
"description": "",
|
69 |
+
"target_delimiter": " ",
|
70 |
+
"fewshot_delimiter": "\n\n",
|
71 |
+
"metric_list": [
|
72 |
+
{
|
73 |
+
"metric": "acc",
|
74 |
+
"aggregation": "mean",
|
75 |
+
"higher_is_better": true
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"metric": "acc_norm",
|
79 |
+
"aggregation": "mean",
|
80 |
+
"higher_is_better": true
|
81 |
+
}
|
82 |
+
],
|
83 |
+
"output_type": "multiple_choice",
|
84 |
+
"repeats": 1,
|
85 |
+
"should_decontaminate": false
|
86 |
+
},
|
87 |
+
"medmcqa_g2b": {
|
88 |
+
"task": "medmcqa_g2b",
|
89 |
+
"dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
|
90 |
+
"training_split": "train",
|
91 |
+
"validation_split": "validation",
|
92 |
+
"test_split": "validation",
|
93 |
+
"doc_to_text": "<function doc_to_text at 0x7f831a19e3a0>",
|
94 |
+
"doc_to_target": "cop",
|
95 |
+
"doc_to_choice": [
|
96 |
+
"A",
|
97 |
+
"B",
|
98 |
+
"C",
|
99 |
+
"D"
|
100 |
+
],
|
101 |
+
"description": "",
|
102 |
+
"target_delimiter": " ",
|
103 |
+
"fewshot_delimiter": "\n\n",
|
104 |
+
"metric_list": [
|
105 |
+
{
|
106 |
+
"metric": "acc",
|
107 |
+
"aggregation": "mean",
|
108 |
+
"higher_is_better": true
|
109 |
+
},
|
110 |
+
{
|
111 |
+
"metric": "acc_norm",
|
112 |
+
"aggregation": "mean",
|
113 |
+
"higher_is_better": true
|
114 |
+
}
|
115 |
+
],
|
116 |
+
"output_type": "multiple_choice",
|
117 |
+
"repeats": 1,
|
118 |
+
"should_decontaminate": true,
|
119 |
+
"doc_to_decontamination_query": "{{question}}"
|
120 |
+
},
|
121 |
+
"medmcqa_orig_filtered": {
|
122 |
+
"task": "medmcqa_orig_filtered",
|
123 |
+
"dataset_path": "AIM-Harvard/medmcqa_original",
|
124 |
+
"training_split": "train",
|
125 |
+
"validation_split": "validation",
|
126 |
+
"test_split": "validation",
|
127 |
+
"doc_to_text": "<function doc_to_text at 0x7f8319ac8310>",
|
128 |
+
"doc_to_target": "cop",
|
129 |
+
"doc_to_choice": [
|
130 |
+
"A",
|
131 |
+
"B",
|
132 |
+
"C",
|
133 |
+
"D"
|
134 |
+
],
|
135 |
+
"description": "",
|
136 |
+
"target_delimiter": " ",
|
137 |
+
"fewshot_delimiter": "\n\n",
|
138 |
+
"metric_list": [
|
139 |
+
{
|
140 |
+
"metric": "acc",
|
141 |
+
"aggregation": "mean",
|
142 |
+
"higher_is_better": true
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"metric": "acc_norm",
|
146 |
+
"aggregation": "mean",
|
147 |
+
"higher_is_better": true
|
148 |
+
}
|
149 |
+
],
|
150 |
+
"output_type": "multiple_choice",
|
151 |
+
"repeats": 1,
|
152 |
+
"should_decontaminate": true,
|
153 |
+
"doc_to_decontamination_query": "{{question}}"
|
154 |
+
},
|
155 |
+
"medqa_4options_g2b": {
|
156 |
+
"task": "medqa_4options_g2b",
|
157 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
|
158 |
+
"training_split": "train",
|
159 |
+
"validation_split": "validation",
|
160 |
+
"test_split": "test",
|
161 |
+
"doc_to_text": "<function doc_to_text at 0x7f831a19e820>",
|
162 |
+
"doc_to_target": "<function doc_to_target at 0x7f831a19eb80>",
|
163 |
+
"doc_to_choice": [
|
164 |
+
"A",
|
165 |
+
"B",
|
166 |
+
"C",
|
167 |
+
"D"
|
168 |
+
],
|
169 |
+
"description": "",
|
170 |
+
"target_delimiter": " ",
|
171 |
+
"fewshot_delimiter": "\n\n",
|
172 |
+
"metric_list": [
|
173 |
+
{
|
174 |
+
"metric": "acc",
|
175 |
+
"aggregation": "mean",
|
176 |
+
"higher_is_better": true
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"metric": "acc_norm",
|
180 |
+
"aggregation": "mean",
|
181 |
+
"higher_is_better": true
|
182 |
+
}
|
183 |
+
],
|
184 |
+
"output_type": "multiple_choice",
|
185 |
+
"repeats": 1,
|
186 |
+
"should_decontaminate": false
|
187 |
+
},
|
188 |
+
"medqa_4options_orig_filtered": {
|
189 |
+
"task": "medqa_4options_orig_filtered",
|
190 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
|
191 |
+
"training_split": "train",
|
192 |
+
"validation_split": "validation",
|
193 |
+
"test_split": "test",
|
194 |
+
"doc_to_text": "<function doc_to_text at 0x7f8319c844c0>",
|
195 |
+
"doc_to_target": "<function doc_to_target at 0x7f8319c30ee0>",
|
196 |
+
"doc_to_choice": [
|
197 |
+
"A",
|
198 |
+
"B",
|
199 |
+
"C",
|
200 |
+
"D"
|
201 |
+
],
|
202 |
+
"description": "",
|
203 |
+
"target_delimiter": " ",
|
204 |
+
"fewshot_delimiter": "\n\n",
|
205 |
+
"metric_list": [
|
206 |
+
{
|
207 |
+
"metric": "acc",
|
208 |
+
"aggregation": "mean",
|
209 |
+
"higher_is_better": true
|
210 |
+
},
|
211 |
+
{
|
212 |
+
"metric": "acc_norm",
|
213 |
+
"aggregation": "mean",
|
214 |
+
"higher_is_better": true
|
215 |
+
}
|
216 |
+
],
|
217 |
+
"output_type": "multiple_choice",
|
218 |
+
"repeats": 1,
|
219 |
+
"should_decontaminate": false
|
220 |
+
}
|
221 |
+
},
|
222 |
+
"versions": {
|
223 |
+
"b4b": "N/A",
|
224 |
+
"b4bqa": "Yaml",
|
225 |
+
"medmcqa_g2b": "Yaml",
|
226 |
+
"medmcqa_orig_filtered": "Yaml",
|
227 |
+
"medqa_4options_g2b": "Yaml",
|
228 |
+
"medqa_4options_orig_filtered": "Yaml"
|
229 |
+
},
|
230 |
+
"n-shot": {
|
231 |
+
"b4b": 0,
|
232 |
+
"b4bqa": 0,
|
233 |
+
"medmcqa_g2b": 0,
|
234 |
+
"medmcqa_orig_filtered": 0,
|
235 |
+
"medqa_4options_g2b": 0,
|
236 |
+
"medqa_4options_orig_filtered": 0
|
237 |
+
},
|
238 |
+
"config": {
|
239 |
+
"model": "hf",
|
240 |
+
"model_args": "pretrained=Qwen/Qwen2-7B,load_in_4bit=True",
|
241 |
+
"batch_size": "auto:64",
|
242 |
+
"batch_sizes": [
|
243 |
+
8,
|
244 |
+
8,
|
245 |
+
16,
|
246 |
+
16,
|
247 |
+
16,
|
248 |
+
16,
|
249 |
+
16,
|
250 |
+
16,
|
251 |
+
16,
|
252 |
+
16,
|
253 |
+
16,
|
254 |
+
16,
|
255 |
+
32,
|
256 |
+
32,
|
257 |
+
32,
|
258 |
+
64,
|
259 |
+
64,
|
260 |
+
64,
|
261 |
+
64,
|
262 |
+
64,
|
263 |
+
64,
|
264 |
+
64,
|
265 |
+
64,
|
266 |
+
64,
|
267 |
+
64,
|
268 |
+
64,
|
269 |
+
64,
|
270 |
+
64,
|
271 |
+
64,
|
272 |
+
64,
|
273 |
+
64,
|
274 |
+
64,
|
275 |
+
64,
|
276 |
+
64,
|
277 |
+
64,
|
278 |
+
64,
|
279 |
+
64,
|
280 |
+
64,
|
281 |
+
64,
|
282 |
+
64,
|
283 |
+
64,
|
284 |
+
64,
|
285 |
+
64,
|
286 |
+
64,
|
287 |
+
64,
|
288 |
+
64,
|
289 |
+
64,
|
290 |
+
64,
|
291 |
+
64,
|
292 |
+
64,
|
293 |
+
64,
|
294 |
+
64,
|
295 |
+
64,
|
296 |
+
64,
|
297 |
+
64,
|
298 |
+
64,
|
299 |
+
64,
|
300 |
+
64,
|
301 |
+
64,
|
302 |
+
64,
|
303 |
+
64,
|
304 |
+
64,
|
305 |
+
64,
|
306 |
+
64,
|
307 |
+
64
|
308 |
+
],
|
309 |
+
"device": "cuda:0",
|
310 |
+
"use_cache": null,
|
311 |
+
"limit": null,
|
312 |
+
"bootstrap_iters": 100000,
|
313 |
+
"gen_kwargs": null
|
314 |
+
},
|
315 |
+
"git_hash": "928c7657"
|
316 |
+
}
|
data/raw-eval-outputs/aaditya-Llama3-OpenBioLLM-70B_results.json
ADDED
@@ -0,0 +1,252 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"b4b": {
|
4 |
+
"acc,none": 0.7876078914919852,
|
5 |
+
"acc_stderr,none": 0.06728010300021042,
|
6 |
+
"acc_norm,none": 0.7876078914919852,
|
7 |
+
"acc_norm_stderr,none": 0.06728010300021042,
|
8 |
+
"alias": "b4b"
|
9 |
+
},
|
10 |
+
"b4bqa": {
|
11 |
+
"acc,none": 0.8510044642857143,
|
12 |
+
"acc_stderr,none": 0.008414043525477657,
|
13 |
+
"acc_norm,none": 0.8510044642857143,
|
14 |
+
"acc_norm_stderr,none": 0.008414043525477657,
|
15 |
+
"alias": " - b4bqa"
|
16 |
+
},
|
17 |
+
"medmcqa_g2b": {
|
18 |
+
"acc,none": 0.632183908045977,
|
19 |
+
"acc_stderr,none": 0.025886440903166212,
|
20 |
+
"acc_norm,none": 0.632183908045977,
|
21 |
+
"acc_norm_stderr,none": 0.025886440903166212,
|
22 |
+
"alias": " - medmcqa_g2b"
|
23 |
+
},
|
24 |
+
"medmcqa_orig_filtered": {
|
25 |
+
"acc,none": 0.7385057471264368,
|
26 |
+
"acc_stderr,none": 0.023590833013480327,
|
27 |
+
"acc_norm,none": 0.7385057471264368,
|
28 |
+
"acc_norm_stderr,none": 0.023590833013480327,
|
29 |
+
"alias": " - medmcqa_orig_filtered"
|
30 |
+
},
|
31 |
+
"medqa_4options_g2b": {
|
32 |
+
"acc,none": 0.708994708994709,
|
33 |
+
"acc_stderr,none": 0.02339382650048486,
|
34 |
+
"acc_norm,none": 0.708994708994709,
|
35 |
+
"acc_norm_stderr,none": 0.02339382650048486,
|
36 |
+
"alias": " - medqa_4options_g2b"
|
37 |
+
},
|
38 |
+
"medqa_4options_orig_filtered": {
|
39 |
+
"acc,none": 0.753968253968254,
|
40 |
+
"acc_stderr,none": 0.022182037202948368,
|
41 |
+
"acc_norm,none": 0.753968253968254,
|
42 |
+
"acc_norm_stderr,none": 0.022182037202948368,
|
43 |
+
"alias": " - medqa_4options_orig_filtered"
|
44 |
+
}
|
45 |
+
},
|
46 |
+
"groups": {
|
47 |
+
"b4b": {
|
48 |
+
"acc,none": 0.7876078914919852,
|
49 |
+
"acc_stderr,none": 0.06728010300021042,
|
50 |
+
"acc_norm,none": 0.7876078914919852,
|
51 |
+
"acc_norm_stderr,none": 0.06728010300021042,
|
52 |
+
"alias": "b4b"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"configs": {
|
56 |
+
"b4bqa": {
|
57 |
+
"task": "b4bqa",
|
58 |
+
"dataset_path": "AIM-Harvard/b4b_drug_qa",
|
59 |
+
"test_split": "test",
|
60 |
+
"doc_to_text": "<function process_cd at 0x7f1d7499c820>",
|
61 |
+
"doc_to_target": "correct_choice",
|
62 |
+
"doc_to_choice": [
|
63 |
+
"A",
|
64 |
+
"B",
|
65 |
+
"C",
|
66 |
+
"D"
|
67 |
+
],
|
68 |
+
"description": "",
|
69 |
+
"target_delimiter": " ",
|
70 |
+
"fewshot_delimiter": "\n\n",
|
71 |
+
"metric_list": [
|
72 |
+
{
|
73 |
+
"metric": "acc",
|
74 |
+
"aggregation": "mean",
|
75 |
+
"higher_is_better": true
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"metric": "acc_norm",
|
79 |
+
"aggregation": "mean",
|
80 |
+
"higher_is_better": true
|
81 |
+
}
|
82 |
+
],
|
83 |
+
"output_type": "multiple_choice",
|
84 |
+
"repeats": 1,
|
85 |
+
"should_decontaminate": false
|
86 |
+
},
|
87 |
+
"medmcqa_g2b": {
|
88 |
+
"task": "medmcqa_g2b",
|
89 |
+
"dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
|
90 |
+
"training_split": "train",
|
91 |
+
"validation_split": "validation",
|
92 |
+
"test_split": "validation",
|
93 |
+
"doc_to_text": "<function doc_to_text at 0x7f1d749e6050>",
|
94 |
+
"doc_to_target": "cop",
|
95 |
+
"doc_to_choice": [
|
96 |
+
"A",
|
97 |
+
"B",
|
98 |
+
"C",
|
99 |
+
"D"
|
100 |
+
],
|
101 |
+
"description": "",
|
102 |
+
"target_delimiter": " ",
|
103 |
+
"fewshot_delimiter": "\n\n",
|
104 |
+
"metric_list": [
|
105 |
+
{
|
106 |
+
"metric": "acc",
|
107 |
+
"aggregation": "mean",
|
108 |
+
"higher_is_better": true
|
109 |
+
},
|
110 |
+
{
|
111 |
+
"metric": "acc_norm",
|
112 |
+
"aggregation": "mean",
|
113 |
+
"higher_is_better": true
|
114 |
+
}
|
115 |
+
],
|
116 |
+
"output_type": "multiple_choice",
|
117 |
+
"repeats": 1,
|
118 |
+
"should_decontaminate": true,
|
119 |
+
"doc_to_decontamination_query": "{{question}}"
|
120 |
+
},
|
121 |
+
"medmcqa_orig_filtered": {
|
122 |
+
"task": "medmcqa_orig_filtered",
|
123 |
+
"dataset_path": "AIM-Harvard/medmcqa_original",
|
124 |
+
"training_split": "train",
|
125 |
+
"validation_split": "validation",
|
126 |
+
"test_split": "validation",
|
127 |
+
"doc_to_text": "<function doc_to_text at 0x7f1d66619240>",
|
128 |
+
"doc_to_target": "cop",
|
129 |
+
"doc_to_choice": [
|
130 |
+
"A",
|
131 |
+
"B",
|
132 |
+
"C",
|
133 |
+
"D"
|
134 |
+
],
|
135 |
+
"description": "",
|
136 |
+
"target_delimiter": " ",
|
137 |
+
"fewshot_delimiter": "\n\n",
|
138 |
+
"metric_list": [
|
139 |
+
{
|
140 |
+
"metric": "acc",
|
141 |
+
"aggregation": "mean",
|
142 |
+
"higher_is_better": true
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"metric": "acc_norm",
|
146 |
+
"aggregation": "mean",
|
147 |
+
"higher_is_better": true
|
148 |
+
}
|
149 |
+
],
|
150 |
+
"output_type": "multiple_choice",
|
151 |
+
"repeats": 1,
|
152 |
+
"should_decontaminate": true,
|
153 |
+
"doc_to_decontamination_query": "{{question}}"
|
154 |
+
},
|
155 |
+
"medqa_4options_g2b": {
|
156 |
+
"task": "medqa_4options_g2b",
|
157 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
|
158 |
+
"training_split": "train",
|
159 |
+
"validation_split": "validation",
|
160 |
+
"test_split": "test",
|
161 |
+
"doc_to_text": "<function doc_to_text at 0x7f1d748a8820>",
|
162 |
+
"doc_to_target": "<function doc_to_target at 0x7f1d748a8b80>",
|
163 |
+
"doc_to_choice": [
|
164 |
+
"A",
|
165 |
+
"B",
|
166 |
+
"C",
|
167 |
+
"D"
|
168 |
+
],
|
169 |
+
"description": "",
|
170 |
+
"target_delimiter": " ",
|
171 |
+
"fewshot_delimiter": "\n\n",
|
172 |
+
"metric_list": [
|
173 |
+
{
|
174 |
+
"metric": "acc",
|
175 |
+
"aggregation": "mean",
|
176 |
+
"higher_is_better": true
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"metric": "acc_norm",
|
180 |
+
"aggregation": "mean",
|
181 |
+
"higher_is_better": true
|
182 |
+
}
|
183 |
+
],
|
184 |
+
"output_type": "multiple_choice",
|
185 |
+
"repeats": 1,
|
186 |
+
"should_decontaminate": false
|
187 |
+
},
|
188 |
+
"medqa_4options_orig_filtered": {
|
189 |
+
"task": "medqa_4options_orig_filtered",
|
190 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
|
191 |
+
"training_split": "train",
|
192 |
+
"validation_split": "validation",
|
193 |
+
"test_split": "test",
|
194 |
+
"doc_to_text": "<function doc_to_text at 0x7f1d74962680>",
|
195 |
+
"doc_to_target": "<function doc_to_target at 0x7f1d74981120>",
|
196 |
+
"doc_to_choice": [
|
197 |
+
"A",
|
198 |
+
"B",
|
199 |
+
"C",
|
200 |
+
"D"
|
201 |
+
],
|
202 |
+
"description": "",
|
203 |
+
"target_delimiter": " ",
|
204 |
+
"fewshot_delimiter": "\n\n",
|
205 |
+
"metric_list": [
|
206 |
+
{
|
207 |
+
"metric": "acc",
|
208 |
+
"aggregation": "mean",
|
209 |
+
"higher_is_better": true
|
210 |
+
},
|
211 |
+
{
|
212 |
+
"metric": "acc_norm",
|
213 |
+
"aggregation": "mean",
|
214 |
+
"higher_is_better": true
|
215 |
+
}
|
216 |
+
],
|
217 |
+
"output_type": "multiple_choice",
|
218 |
+
"repeats": 1,
|
219 |
+
"should_decontaminate": false
|
220 |
+
}
|
221 |
+
},
|
222 |
+
"versions": {
|
223 |
+
"b4b": "N/A",
|
224 |
+
"b4bqa": "Yaml",
|
225 |
+
"medmcqa_g2b": "Yaml",
|
226 |
+
"medmcqa_orig_filtered": "Yaml",
|
227 |
+
"medqa_4options_g2b": "Yaml",
|
228 |
+
"medqa_4options_orig_filtered": "Yaml"
|
229 |
+
},
|
230 |
+
"n-shot": {
|
231 |
+
"b4b": 0,
|
232 |
+
"b4bqa": 0,
|
233 |
+
"medmcqa_g2b": 0,
|
234 |
+
"medmcqa_orig_filtered": 0,
|
235 |
+
"medqa_4options_g2b": 0,
|
236 |
+
"medqa_4options_orig_filtered": 0
|
237 |
+
},
|
238 |
+
"config": {
|
239 |
+
"model": "hf",
|
240 |
+
"model_args": "pretrained=aaditya/Llama3-OpenBioLLM-70B,parallelize=True,load_in_4bit=True",
|
241 |
+
"batch_size": "auto",
|
242 |
+
"batch_sizes": [
|
243 |
+
32
|
244 |
+
],
|
245 |
+
"device": null,
|
246 |
+
"use_cache": null,
|
247 |
+
"limit": null,
|
248 |
+
"bootstrap_iters": 100000,
|
249 |
+
"gen_kwargs": null
|
250 |
+
},
|
251 |
+
"git_hash": "928c7657"
|
252 |
+
}
|
data/raw-eval-outputs/johnsnowlabs-JSL-MedLlama-3-8B-v9_results.json
ADDED
@@ -0,0 +1,252 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"b4b": {
|
4 |
+
"acc,none": 0.7444512946979038,
|
5 |
+
"acc_stderr,none": 0.04274747119698657,
|
6 |
+
"acc_norm,none": 0.7444512946979038,
|
7 |
+
"acc_norm_stderr,none": 0.04274747119698657,
|
8 |
+
"alias": "b4b"
|
9 |
+
},
|
10 |
+
"b4bqa": {
|
11 |
+
"acc,none": 0.7516741071428571,
|
12 |
+
"acc_stderr,none": 0.010208877794084196,
|
13 |
+
"acc_norm,none": 0.7516741071428571,
|
14 |
+
"acc_norm_stderr,none": 0.010208877794084196,
|
15 |
+
"alias": " - b4bqa"
|
16 |
+
},
|
17 |
+
"medmcqa_g2b": {
|
18 |
+
"acc,none": 0.6408045977011494,
|
19 |
+
"acc_stderr,none": 0.025755112822545917,
|
20 |
+
"acc_norm,none": 0.6408045977011494,
|
21 |
+
"acc_norm_stderr,none": 0.025755112822545917,
|
22 |
+
"alias": " - medmcqa_g2b"
|
23 |
+
},
|
24 |
+
"medmcqa_orig_filtered": {
|
25 |
+
"acc,none": 0.7701149425287356,
|
26 |
+
"acc_stderr,none": 0.022587512669518847,
|
27 |
+
"acc_norm,none": 0.7701149425287356,
|
28 |
+
"acc_norm_stderr,none": 0.022587512669518847,
|
29 |
+
"alias": " - medmcqa_orig_filtered"
|
30 |
+
},
|
31 |
+
"medqa_4options_g2b": {
|
32 |
+
"acc,none": 0.7063492063492064,
|
33 |
+
"acc_stderr,none": 0.023456037383982033,
|
34 |
+
"acc_norm,none": 0.7063492063492064,
|
35 |
+
"acc_norm_stderr,none": 0.023456037383982033,
|
36 |
+
"alias": " - medqa_4options_g2b"
|
37 |
+
},
|
38 |
+
"medqa_4options_orig_filtered": {
|
39 |
+
"acc,none": 0.8201058201058201,
|
40 |
+
"acc_stderr,none": 0.01978211983276641,
|
41 |
+
"acc_norm,none": 0.8201058201058201,
|
42 |
+
"acc_norm_stderr,none": 0.01978211983276641,
|
43 |
+
"alias": " - medqa_4options_orig_filtered"
|
44 |
+
}
|
45 |
+
},
|
46 |
+
"groups": {
|
47 |
+
"b4b": {
|
48 |
+
"acc,none": 0.7444512946979038,
|
49 |
+
"acc_stderr,none": 0.04274747119698657,
|
50 |
+
"acc_norm,none": 0.7444512946979038,
|
51 |
+
"acc_norm_stderr,none": 0.04274747119698657,
|
52 |
+
"alias": "b4b"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"configs": {
|
56 |
+
"b4bqa": {
|
57 |
+
"task": "b4bqa",
|
58 |
+
"dataset_path": "AIM-Harvard/b4b_drug_qa",
|
59 |
+
"test_split": "test",
|
60 |
+
"doc_to_text": "<function process_cd at 0x78639f2e7040>",
|
61 |
+
"doc_to_target": "correct_choice",
|
62 |
+
"doc_to_choice": [
|
63 |
+
"A",
|
64 |
+
"B",
|
65 |
+
"C",
|
66 |
+
"D"
|
67 |
+
],
|
68 |
+
"description": "",
|
69 |
+
"target_delimiter": " ",
|
70 |
+
"fewshot_delimiter": "\n\n",
|
71 |
+
"metric_list": [
|
72 |
+
{
|
73 |
+
"metric": "acc",
|
74 |
+
"aggregation": "mean",
|
75 |
+
"higher_is_better": true
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"metric": "acc_norm",
|
79 |
+
"aggregation": "mean",
|
80 |
+
"higher_is_better": true
|
81 |
+
}
|
82 |
+
],
|
83 |
+
"output_type": "multiple_choice",
|
84 |
+
"repeats": 1,
|
85 |
+
"should_decontaminate": false
|
86 |
+
},
|
87 |
+
"medmcqa_g2b": {
|
88 |
+
"task": "medmcqa_g2b",
|
89 |
+
"dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
|
90 |
+
"training_split": "train",
|
91 |
+
"validation_split": "validation",
|
92 |
+
"test_split": "validation",
|
93 |
+
"doc_to_text": "<function doc_to_text at 0x78639ef36280>",
|
94 |
+
"doc_to_target": "cop",
|
95 |
+
"doc_to_choice": [
|
96 |
+
"A",
|
97 |
+
"B",
|
98 |
+
"C",
|
99 |
+
"D"
|
100 |
+
],
|
101 |
+
"description": "",
|
102 |
+
"target_delimiter": " ",
|
103 |
+
"fewshot_delimiter": "\n\n",
|
104 |
+
"metric_list": [
|
105 |
+
{
|
106 |
+
"metric": "acc",
|
107 |
+
"aggregation": "mean",
|
108 |
+
"higher_is_better": true
|
109 |
+
},
|
110 |
+
{
|
111 |
+
"metric": "acc_norm",
|
112 |
+
"aggregation": "mean",
|
113 |
+
"higher_is_better": true
|
114 |
+
}
|
115 |
+
],
|
116 |
+
"output_type": "multiple_choice",
|
117 |
+
"repeats": 1,
|
118 |
+
"should_decontaminate": true,
|
119 |
+
"doc_to_decontamination_query": "{{question}}"
|
120 |
+
},
|
121 |
+
"medmcqa_orig_filtered": {
|
122 |
+
"task": "medmcqa_orig_filtered",
|
123 |
+
"dataset_path": "AIM-Harvard/medmcqa_original",
|
124 |
+
"training_split": "train",
|
125 |
+
"validation_split": "validation",
|
126 |
+
"test_split": "validation",
|
127 |
+
"doc_to_text": "<function doc_to_text at 0x78639f2d9e50>",
|
128 |
+
"doc_to_target": "cop",
|
129 |
+
"doc_to_choice": [
|
130 |
+
"A",
|
131 |
+
"B",
|
132 |
+
"C",
|
133 |
+
"D"
|
134 |
+
],
|
135 |
+
"description": "",
|
136 |
+
"target_delimiter": " ",
|
137 |
+
"fewshot_delimiter": "\n\n",
|
138 |
+
"metric_list": [
|
139 |
+
{
|
140 |
+
"metric": "acc",
|
141 |
+
"aggregation": "mean",
|
142 |
+
"higher_is_better": true
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"metric": "acc_norm",
|
146 |
+
"aggregation": "mean",
|
147 |
+
"higher_is_better": true
|
148 |
+
}
|
149 |
+
],
|
150 |
+
"output_type": "multiple_choice",
|
151 |
+
"repeats": 1,
|
152 |
+
"should_decontaminate": true,
|
153 |
+
"doc_to_decontamination_query": "{{question}}"
|
154 |
+
},
|
155 |
+
"medqa_4options_g2b": {
|
156 |
+
"task": "medqa_4options_g2b",
|
157 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
|
158 |
+
"training_split": "train",
|
159 |
+
"validation_split": "validation",
|
160 |
+
"test_split": "test",
|
161 |
+
"doc_to_text": "<function doc_to_text at 0x78639f24a0d0>",
|
162 |
+
"doc_to_target": "<function doc_to_target at 0x78639f24a550>",
|
163 |
+
"doc_to_choice": [
|
164 |
+
"A",
|
165 |
+
"B",
|
166 |
+
"C",
|
167 |
+
"D"
|
168 |
+
],
|
169 |
+
"description": "",
|
170 |
+
"target_delimiter": " ",
|
171 |
+
"fewshot_delimiter": "\n\n",
|
172 |
+
"metric_list": [
|
173 |
+
{
|
174 |
+
"metric": "acc",
|
175 |
+
"aggregation": "mean",
|
176 |
+
"higher_is_better": true
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"metric": "acc_norm",
|
180 |
+
"aggregation": "mean",
|
181 |
+
"higher_is_better": true
|
182 |
+
}
|
183 |
+
],
|
184 |
+
"output_type": "multiple_choice",
|
185 |
+
"repeats": 1,
|
186 |
+
"should_decontaminate": false
|
187 |
+
},
|
188 |
+
"medqa_4options_orig_filtered": {
|
189 |
+
"task": "medqa_4options_orig_filtered",
|
190 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
|
191 |
+
"training_split": "train",
|
192 |
+
"validation_split": "validation",
|
193 |
+
"test_split": "test",
|
194 |
+
"doc_to_text": "<function doc_to_text at 0x78639f24a670>",
|
195 |
+
"doc_to_target": "<function doc_to_target at 0x78639f24a8b0>",
|
196 |
+
"doc_to_choice": [
|
197 |
+
"A",
|
198 |
+
"B",
|
199 |
+
"C",
|
200 |
+
"D"
|
201 |
+
],
|
202 |
+
"description": "",
|
203 |
+
"target_delimiter": " ",
|
204 |
+
"fewshot_delimiter": "\n\n",
|
205 |
+
"metric_list": [
|
206 |
+
{
|
207 |
+
"metric": "acc",
|
208 |
+
"aggregation": "mean",
|
209 |
+
"higher_is_better": true
|
210 |
+
},
|
211 |
+
{
|
212 |
+
"metric": "acc_norm",
|
213 |
+
"aggregation": "mean",
|
214 |
+
"higher_is_better": true
|
215 |
+
}
|
216 |
+
],
|
217 |
+
"output_type": "multiple_choice",
|
218 |
+
"repeats": 1,
|
219 |
+
"should_decontaminate": false
|
220 |
+
}
|
221 |
+
},
|
222 |
+
"versions": {
|
223 |
+
"b4b": "N/A",
|
224 |
+
"b4bqa": "Yaml",
|
225 |
+
"medmcqa_g2b": "Yaml",
|
226 |
+
"medmcqa_orig_filtered": "Yaml",
|
227 |
+
"medqa_4options_g2b": "Yaml",
|
228 |
+
"medqa_4options_orig_filtered": "Yaml"
|
229 |
+
},
|
230 |
+
"n-shot": {
|
231 |
+
"b4b": 0,
|
232 |
+
"b4bqa": 0,
|
233 |
+
"medmcqa_g2b": 0,
|
234 |
+
"medmcqa_orig_filtered": 0,
|
235 |
+
"medqa_4options_g2b": 0,
|
236 |
+
"medqa_4options_orig_filtered": 0
|
237 |
+
},
|
238 |
+
"config": {
|
239 |
+
"model": "hf",
|
240 |
+
"model_args": "pretrained=johnsnowlabs/JSL-MedLlama-3-8B-v9,parallelize=True,load_in_4bit=True",
|
241 |
+
"batch_size": "auto",
|
242 |
+
"batch_sizes": [
|
243 |
+
8
|
244 |
+
],
|
245 |
+
"device": null,
|
246 |
+
"use_cache": null,
|
247 |
+
"limit": null,
|
248 |
+
"bootstrap_iters": 100000,
|
249 |
+
"gen_kwargs": null
|
250 |
+
},
|
251 |
+
"git_hash": "a6ca0b90"
|
252 |
+
}
|
data/raw-eval-outputs/meta-llama-Llama-2-70B-hf_results.json
ADDED
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"b4b": {
|
4 |
+
"acc,none": 0.656288532675709,
|
5 |
+
"acc_stderr,none": 0.11099422321488661,
|
6 |
+
"acc_norm,none": 0.656288532675709,
|
7 |
+
"acc_norm_stderr,none": 0.11099422321488661,
|
8 |
+
"alias": "b4b"
|
9 |
+
},
|
10 |
+
"b4bqa": {
|
11 |
+
"acc,none": 0.7700892857142857,
|
12 |
+
"acc_stderr,none": 0.009942654607749084,
|
13 |
+
"acc_norm,none": 0.7700892857142857,
|
14 |
+
"acc_norm_stderr,none": 0.009942654607749084,
|
15 |
+
"alias": " - b4bqa"
|
16 |
+
},
|
17 |
+
"medmcqa_g2b": {
|
18 |
+
"acc,none": 0.45977011494252873,
|
19 |
+
"acc_stderr,none": 0.026754382675705738,
|
20 |
+
"acc_norm,none": 0.45977011494252873,
|
21 |
+
"acc_norm_stderr,none": 0.026754382675705738,
|
22 |
+
"alias": " - medmcqa_g2b"
|
23 |
+
},
|
24 |
+
"medmcqa_orig_filtered": {
|
25 |
+
"acc,none": 0.5229885057471264,
|
26 |
+
"acc_stderr,none": 0.026813021515239517,
|
27 |
+
"acc_norm,none": 0.5229885057471264,
|
28 |
+
"acc_norm_stderr,none": 0.026813021515239517,
|
29 |
+
"alias": " - medmcqa_orig_filtered"
|
30 |
+
},
|
31 |
+
"medqa_4options_g2b": {
|
32 |
+
"acc,none": 0.5264550264550265,
|
33 |
+
"acc_stderr,none": 0.025715239811346758,
|
34 |
+
"acc_norm,none": 0.5264550264550265,
|
35 |
+
"acc_norm_stderr,none": 0.025715239811346758,
|
36 |
+
"alias": " - medqa_4options_g2b"
|
37 |
+
},
|
38 |
+
"medqa_4options_orig_filtered": {
|
39 |
+
"acc,none": 0.5502645502645502,
|
40 |
+
"acc_stderr,none": 0.02562085704293665,
|
41 |
+
"acc_norm,none": 0.5502645502645502,
|
42 |
+
"acc_norm_stderr,none": 0.02562085704293665,
|
43 |
+
"alias": " - medqa_4options_orig_filtered"
|
44 |
+
}
|
45 |
+
},
|
46 |
+
"groups": {
|
47 |
+
"b4b": {
|
48 |
+
"acc,none": 0.656288532675709,
|
49 |
+
"acc_stderr,none": 0.11099422321488661,
|
50 |
+
"acc_norm,none": 0.656288532675709,
|
51 |
+
"acc_norm_stderr,none": 0.11099422321488661,
|
52 |
+
"alias": "b4b"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"configs": {
|
56 |
+
"b4bqa": {
|
57 |
+
"task": "b4bqa",
|
58 |
+
"dataset_path": "AIM-Harvard/b4b_drug_qa",
|
59 |
+
"test_split": "test",
|
60 |
+
"doc_to_text": "<function process_cd at 0x7fc9f1d3d090>",
|
61 |
+
"doc_to_target": "correct_choice",
|
62 |
+
"doc_to_choice": [
|
63 |
+
"A",
|
64 |
+
"B",
|
65 |
+
"C",
|
66 |
+
"D"
|
67 |
+
],
|
68 |
+
"description": "",
|
69 |
+
"target_delimiter": " ",
|
70 |
+
"fewshot_delimiter": "\n\n",
|
71 |
+
"metric_list": [
|
72 |
+
{
|
73 |
+
"metric": "acc",
|
74 |
+
"aggregation": "mean",
|
75 |
+
"higher_is_better": true
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"metric": "acc_norm",
|
79 |
+
"aggregation": "mean",
|
80 |
+
"higher_is_better": true
|
81 |
+
}
|
82 |
+
],
|
83 |
+
"output_type": "multiple_choice",
|
84 |
+
"repeats": 1,
|
85 |
+
"should_decontaminate": false
|
86 |
+
},
|
87 |
+
"medmcqa_g2b": {
|
88 |
+
"task": "medmcqa_g2b",
|
89 |
+
"dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
|
90 |
+
"training_split": "train",
|
91 |
+
"validation_split": "validation",
|
92 |
+
"test_split": "validation",
|
93 |
+
"doc_to_text": "<function doc_to_text at 0x7fc9f0108550>",
|
94 |
+
"doc_to_target": "cop",
|
95 |
+
"doc_to_choice": [
|
96 |
+
"A",
|
97 |
+
"B",
|
98 |
+
"C",
|
99 |
+
"D"
|
100 |
+
],
|
101 |
+
"description": "",
|
102 |
+
"target_delimiter": " ",
|
103 |
+
"fewshot_delimiter": "\n\n",
|
104 |
+
"metric_list": [
|
105 |
+
{
|
106 |
+
"metric": "acc",
|
107 |
+
"aggregation": "mean",
|
108 |
+
"higher_is_better": true
|
109 |
+
},
|
110 |
+
{
|
111 |
+
"metric": "acc_norm",
|
112 |
+
"aggregation": "mean",
|
113 |
+
"higher_is_better": true
|
114 |
+
}
|
115 |
+
],
|
116 |
+
"output_type": "multiple_choice",
|
117 |
+
"repeats": 1,
|
118 |
+
"should_decontaminate": true,
|
119 |
+
"doc_to_decontamination_query": "{{question}}"
|
120 |
+
},
|
121 |
+
"medmcqa_orig_filtered": {
|
122 |
+
"task": "medmcqa_orig_filtered",
|
123 |
+
"dataset_path": "AIM-Harvard/medmcqa_original",
|
124 |
+
"training_split": "train",
|
125 |
+
"validation_split": "validation",
|
126 |
+
"test_split": "validation",
|
127 |
+
"doc_to_text": "<function doc_to_text at 0x7fc9f0108700>",
|
128 |
+
"doc_to_target": "cop",
|
129 |
+
"doc_to_choice": [
|
130 |
+
"A",
|
131 |
+
"B",
|
132 |
+
"C",
|
133 |
+
"D"
|
134 |
+
],
|
135 |
+
"description": "",
|
136 |
+
"target_delimiter": " ",
|
137 |
+
"fewshot_delimiter": "\n\n",
|
138 |
+
"metric_list": [
|
139 |
+
{
|
140 |
+
"metric": "acc",
|
141 |
+
"aggregation": "mean",
|
142 |
+
"higher_is_better": true
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"metric": "acc_norm",
|
146 |
+
"aggregation": "mean",
|
147 |
+
"higher_is_better": true
|
148 |
+
}
|
149 |
+
],
|
150 |
+
"output_type": "multiple_choice",
|
151 |
+
"repeats": 1,
|
152 |
+
"should_decontaminate": true,
|
153 |
+
"doc_to_decontamination_query": "{{question}}"
|
154 |
+
},
|
155 |
+
"medqa_4options_g2b": {
|
156 |
+
"task": "medqa_4options_g2b",
|
157 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
|
158 |
+
"training_split": "train",
|
159 |
+
"validation_split": "validation",
|
160 |
+
"test_split": "test",
|
161 |
+
"doc_to_text": "<function doc_to_text at 0x7fc9f010b6d0>",
|
162 |
+
"doc_to_target": "<function doc_to_target at 0x7fc9f010ba30>",
|
163 |
+
"doc_to_choice": [
|
164 |
+
"A",
|
165 |
+
"B",
|
166 |
+
"C",
|
167 |
+
"D"
|
168 |
+
],
|
169 |
+
"description": "",
|
170 |
+
"target_delimiter": " ",
|
171 |
+
"fewshot_delimiter": "\n\n",
|
172 |
+
"metric_list": [
|
173 |
+
{
|
174 |
+
"metric": "acc",
|
175 |
+
"aggregation": "mean",
|
176 |
+
"higher_is_better": true
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"metric": "acc_norm",
|
180 |
+
"aggregation": "mean",
|
181 |
+
"higher_is_better": true
|
182 |
+
}
|
183 |
+
],
|
184 |
+
"output_type": "multiple_choice",
|
185 |
+
"repeats": 1,
|
186 |
+
"should_decontaminate": false
|
187 |
+
},
|
188 |
+
"medqa_4options_orig_filtered": {
|
189 |
+
"task": "medqa_4options_orig_filtered",
|
190 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
|
191 |
+
"training_split": "train",
|
192 |
+
"validation_split": "validation",
|
193 |
+
"test_split": "test",
|
194 |
+
"doc_to_text": "<function doc_to_text at 0x7fc9f010bc70>",
|
195 |
+
"doc_to_target": "<function doc_to_target at 0x7fc9f010beb0>",
|
196 |
+
"doc_to_choice": [
|
197 |
+
"A",
|
198 |
+
"B",
|
199 |
+
"C",
|
200 |
+
"D"
|
201 |
+
],
|
202 |
+
"description": "",
|
203 |
+
"target_delimiter": " ",
|
204 |
+
"fewshot_delimiter": "\n\n",
|
205 |
+
"metric_list": [
|
206 |
+
{
|
207 |
+
"metric": "acc",
|
208 |
+
"aggregation": "mean",
|
209 |
+
"higher_is_better": true
|
210 |
+
},
|
211 |
+
{
|
212 |
+
"metric": "acc_norm",
|
213 |
+
"aggregation": "mean",
|
214 |
+
"higher_is_better": true
|
215 |
+
}
|
216 |
+
],
|
217 |
+
"output_type": "multiple_choice",
|
218 |
+
"repeats": 1,
|
219 |
+
"should_decontaminate": false
|
220 |
+
}
|
221 |
+
},
|
222 |
+
"versions": {
|
223 |
+
"b4b": "N/A",
|
224 |
+
"b4bqa": "Yaml",
|
225 |
+
"medmcqa_g2b": "Yaml",
|
226 |
+
"medmcqa_orig_filtered": "Yaml",
|
227 |
+
"medqa_4options_g2b": "Yaml",
|
228 |
+
"medqa_4options_orig_filtered": "Yaml"
|
229 |
+
},
|
230 |
+
"n-shot": {
|
231 |
+
"b4b": 0,
|
232 |
+
"b4bqa": 0,
|
233 |
+
"medmcqa_g2b": 0,
|
234 |
+
"medmcqa_orig_filtered": 0,
|
235 |
+
"medqa_4options_g2b": 0,
|
236 |
+
"medqa_4options_orig_filtered": 0
|
237 |
+
},
|
238 |
+
"config": {
|
239 |
+
"model": "hf",
|
240 |
+
"model_args": "pretrained=meta-llama/Llama-2-70B-hf,load_in_4bit=True",
|
241 |
+
"batch_size": "4",
|
242 |
+
"batch_sizes": [],
|
243 |
+
"device": "cuda:0",
|
244 |
+
"use_cache": null,
|
245 |
+
"limit": null,
|
246 |
+
"bootstrap_iters": 100000,
|
247 |
+
"gen_kwargs": null
|
248 |
+
},
|
249 |
+
"git_hash": "928c7657"
|
250 |
+
}
|
data/raw-eval-outputs/meta-llama-Llama-2-7b-hf_results.json
ADDED
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"b4b": {
|
4 |
+
"acc,none": 0.3600493218249075,
|
5 |
+
"acc_stderr,none": 0.021816304388272503,
|
6 |
+
"acc_norm,none": 0.3600493218249075,
|
7 |
+
"acc_norm_stderr,none": 0.021816304388272503,
|
8 |
+
"alias": "b4b"
|
9 |
+
},
|
10 |
+
"b4bqa": {
|
11 |
+
"acc,none": 0.36830357142857145,
|
12 |
+
"acc_stderr,none": 0.011397494280772988,
|
13 |
+
"acc_norm,none": 0.36830357142857145,
|
14 |
+
"acc_norm_stderr,none": 0.011397494280772988,
|
15 |
+
"alias": " - b4bqa"
|
16 |
+
},
|
17 |
+
"medmcqa_g2b": {
|
18 |
+
"acc,none": 0.3390804597701149,
|
19 |
+
"acc_stderr,none": 0.02541329280547327,
|
20 |
+
"acc_norm,none": 0.3390804597701149,
|
21 |
+
"acc_norm_stderr,none": 0.02541329280547327,
|
22 |
+
"alias": " - medmcqa_g2b"
|
23 |
+
},
|
24 |
+
"medmcqa_orig_filtered": {
|
25 |
+
"acc,none": 0.34195402298850575,
|
26 |
+
"acc_stderr,none": 0.025465208743331563,
|
27 |
+
"acc_norm,none": 0.34195402298850575,
|
28 |
+
"acc_norm_stderr,none": 0.025465208743331563,
|
29 |
+
"alias": " - medmcqa_orig_filtered"
|
30 |
+
},
|
31 |
+
"medqa_4options_g2b": {
|
32 |
+
"acc,none": 0.3439153439153439,
|
33 |
+
"acc_stderr,none": 0.024464426625596437,
|
34 |
+
"acc_norm,none": 0.3439153439153439,
|
35 |
+
"acc_norm_stderr,none": 0.024464426625596437,
|
36 |
+
"alias": " - medqa_4options_g2b"
|
37 |
+
},
|
38 |
+
"medqa_4options_orig_filtered": {
|
39 |
+
"acc,none": 0.373015873015873,
|
40 |
+
"acc_stderr,none": 0.02490699045899257,
|
41 |
+
"acc_norm,none": 0.373015873015873,
|
42 |
+
"acc_norm_stderr,none": 0.02490699045899257,
|
43 |
+
"alias": " - medqa_4options_orig_filtered"
|
44 |
+
}
|
45 |
+
},
|
46 |
+
"groups": {
|
47 |
+
"b4b": {
|
48 |
+
"acc,none": 0.3600493218249075,
|
49 |
+
"acc_stderr,none": 0.021816304388272503,
|
50 |
+
"acc_norm,none": 0.3600493218249075,
|
51 |
+
"acc_norm_stderr,none": 0.021816304388272503,
|
52 |
+
"alias": "b4b"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"configs": {
|
56 |
+
"b4bqa": {
|
57 |
+
"task": "b4bqa",
|
58 |
+
"dataset_path": "AIM-Harvard/b4b_drug_qa",
|
59 |
+
"test_split": "test",
|
60 |
+
"doc_to_text": "<function process_cd at 0x7f9fc69011b0>",
|
61 |
+
"doc_to_target": "correct_choice",
|
62 |
+
"doc_to_choice": [
|
63 |
+
"A",
|
64 |
+
"B",
|
65 |
+
"C",
|
66 |
+
"D"
|
67 |
+
],
|
68 |
+
"description": "",
|
69 |
+
"target_delimiter": " ",
|
70 |
+
"fewshot_delimiter": "\n\n",
|
71 |
+
"metric_list": [
|
72 |
+
{
|
73 |
+
"metric": "acc",
|
74 |
+
"aggregation": "mean",
|
75 |
+
"higher_is_better": true
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"metric": "acc_norm",
|
79 |
+
"aggregation": "mean",
|
80 |
+
"higher_is_better": true
|
81 |
+
}
|
82 |
+
],
|
83 |
+
"output_type": "multiple_choice",
|
84 |
+
"repeats": 1,
|
85 |
+
"should_decontaminate": false
|
86 |
+
},
|
87 |
+
"medmcqa_g2b": {
|
88 |
+
"task": "medmcqa_g2b",
|
89 |
+
"dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
|
90 |
+
"training_split": "train",
|
91 |
+
"validation_split": "validation",
|
92 |
+
"test_split": "validation",
|
93 |
+
"doc_to_text": "<function doc_to_text at 0x7f9fc4d94670>",
|
94 |
+
"doc_to_target": "cop",
|
95 |
+
"doc_to_choice": [
|
96 |
+
"A",
|
97 |
+
"B",
|
98 |
+
"C",
|
99 |
+
"D"
|
100 |
+
],
|
101 |
+
"description": "",
|
102 |
+
"target_delimiter": " ",
|
103 |
+
"fewshot_delimiter": "\n\n",
|
104 |
+
"metric_list": [
|
105 |
+
{
|
106 |
+
"metric": "acc",
|
107 |
+
"aggregation": "mean",
|
108 |
+
"higher_is_better": true
|
109 |
+
},
|
110 |
+
{
|
111 |
+
"metric": "acc_norm",
|
112 |
+
"aggregation": "mean",
|
113 |
+
"higher_is_better": true
|
114 |
+
}
|
115 |
+
],
|
116 |
+
"output_type": "multiple_choice",
|
117 |
+
"repeats": 1,
|
118 |
+
"should_decontaminate": true,
|
119 |
+
"doc_to_decontamination_query": "{{question}}"
|
120 |
+
},
|
121 |
+
"medmcqa_orig_filtered": {
|
122 |
+
"task": "medmcqa_orig_filtered",
|
123 |
+
"dataset_path": "AIM-Harvard/medmcqa_original",
|
124 |
+
"training_split": "train",
|
125 |
+
"validation_split": "validation",
|
126 |
+
"test_split": "validation",
|
127 |
+
"doc_to_text": "<function doc_to_text at 0x7f9fc4d94820>",
|
128 |
+
"doc_to_target": "cop",
|
129 |
+
"doc_to_choice": [
|
130 |
+
"A",
|
131 |
+
"B",
|
132 |
+
"C",
|
133 |
+
"D"
|
134 |
+
],
|
135 |
+
"description": "",
|
136 |
+
"target_delimiter": " ",
|
137 |
+
"fewshot_delimiter": "\n\n",
|
138 |
+
"metric_list": [
|
139 |
+
{
|
140 |
+
"metric": "acc",
|
141 |
+
"aggregation": "mean",
|
142 |
+
"higher_is_better": true
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"metric": "acc_norm",
|
146 |
+
"aggregation": "mean",
|
147 |
+
"higher_is_better": true
|
148 |
+
}
|
149 |
+
],
|
150 |
+
"output_type": "multiple_choice",
|
151 |
+
"repeats": 1,
|
152 |
+
"should_decontaminate": true,
|
153 |
+
"doc_to_decontamination_query": "{{question}}"
|
154 |
+
},
|
155 |
+
"medqa_4options_g2b": {
|
156 |
+
"task": "medqa_4options_g2b",
|
157 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
|
158 |
+
"training_split": "train",
|
159 |
+
"validation_split": "validation",
|
160 |
+
"test_split": "test",
|
161 |
+
"doc_to_text": "<function doc_to_text at 0x7f9fc4d977f0>",
|
162 |
+
"doc_to_target": "<function doc_to_target at 0x7f9fc4d97b50>",
|
163 |
+
"doc_to_choice": [
|
164 |
+
"A",
|
165 |
+
"B",
|
166 |
+
"C",
|
167 |
+
"D"
|
168 |
+
],
|
169 |
+
"description": "",
|
170 |
+
"target_delimiter": " ",
|
171 |
+
"fewshot_delimiter": "\n\n",
|
172 |
+
"metric_list": [
|
173 |
+
{
|
174 |
+
"metric": "acc",
|
175 |
+
"aggregation": "mean",
|
176 |
+
"higher_is_better": true
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"metric": "acc_norm",
|
180 |
+
"aggregation": "mean",
|
181 |
+
"higher_is_better": true
|
182 |
+
}
|
183 |
+
],
|
184 |
+
"output_type": "multiple_choice",
|
185 |
+
"repeats": 1,
|
186 |
+
"should_decontaminate": false
|
187 |
+
},
|
188 |
+
"medqa_4options_orig_filtered": {
|
189 |
+
"task": "medqa_4options_orig_filtered",
|
190 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
|
191 |
+
"training_split": "train",
|
192 |
+
"validation_split": "validation",
|
193 |
+
"test_split": "test",
|
194 |
+
"doc_to_text": "<function doc_to_text at 0x7f9fc4d97d90>",
|
195 |
+
"doc_to_target": "<function doc_to_target at 0x7f9fc4db8040>",
|
196 |
+
"doc_to_choice": [
|
197 |
+
"A",
|
198 |
+
"B",
|
199 |
+
"C",
|
200 |
+
"D"
|
201 |
+
],
|
202 |
+
"description": "",
|
203 |
+
"target_delimiter": " ",
|
204 |
+
"fewshot_delimiter": "\n\n",
|
205 |
+
"metric_list": [
|
206 |
+
{
|
207 |
+
"metric": "acc",
|
208 |
+
"aggregation": "mean",
|
209 |
+
"higher_is_better": true
|
210 |
+
},
|
211 |
+
{
|
212 |
+
"metric": "acc_norm",
|
213 |
+
"aggregation": "mean",
|
214 |
+
"higher_is_better": true
|
215 |
+
}
|
216 |
+
],
|
217 |
+
"output_type": "multiple_choice",
|
218 |
+
"repeats": 1,
|
219 |
+
"should_decontaminate": false
|
220 |
+
}
|
221 |
+
},
|
222 |
+
"versions": {
|
223 |
+
"b4b": "N/A",
|
224 |
+
"b4bqa": "Yaml",
|
225 |
+
"medmcqa_g2b": "Yaml",
|
226 |
+
"medmcqa_orig_filtered": "Yaml",
|
227 |
+
"medqa_4options_g2b": "Yaml",
|
228 |
+
"medqa_4options_orig_filtered": "Yaml"
|
229 |
+
},
|
230 |
+
"n-shot": {
|
231 |
+
"b4b": 0,
|
232 |
+
"b4bqa": 0,
|
233 |
+
"medmcqa_g2b": 0,
|
234 |
+
"medmcqa_orig_filtered": 0,
|
235 |
+
"medqa_4options_g2b": 0,
|
236 |
+
"medqa_4options_orig_filtered": 0
|
237 |
+
},
|
238 |
+
"config": {
|
239 |
+
"model": "hf",
|
240 |
+
"model_args": "pretrained=meta-llama/Llama-2-7b-hf,load_in_4bit=True",
|
241 |
+
"batch_size": "4",
|
242 |
+
"batch_sizes": [],
|
243 |
+
"device": "cuda:0",
|
244 |
+
"use_cache": null,
|
245 |
+
"limit": null,
|
246 |
+
"bootstrap_iters": 100000,
|
247 |
+
"gen_kwargs": null
|
248 |
+
},
|
249 |
+
"git_hash": "928c7657"
|
250 |
+
}
|
data/raw-eval-outputs/meta-llama-Meta-Llama-3-70B_results.json
ADDED
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"b4b": {
|
4 |
+
"acc,none": 0.8255240443896424,
|
5 |
+
"acc_stderr,none": 0.07700722588574725,
|
6 |
+
"acc_norm,none": 0.8255240443896424,
|
7 |
+
"acc_norm_stderr,none": 0.07700722588574725,
|
8 |
+
"alias": "b4b"
|
9 |
+
},
|
10 |
+
"b4bqa": {
|
11 |
+
"acc,none": 0.9012276785714286,
|
12 |
+
"acc_stderr,none": 0.007049967229617683,
|
13 |
+
"acc_norm,none": 0.9012276785714286,
|
14 |
+
"acc_norm_stderr,none": 0.007049967229617683,
|
15 |
+
"alias": " - b4bqa"
|
16 |
+
},
|
17 |
+
"medmcqa_g2b": {
|
18 |
+
"acc,none": 0.6666666666666666,
|
19 |
+
"acc_stderr,none": 0.025306320600037485,
|
20 |
+
"acc_norm,none": 0.6666666666666666,
|
21 |
+
"acc_norm_stderr,none": 0.025306320600037485,
|
22 |
+
"alias": " - medmcqa_g2b"
|
23 |
+
},
|
24 |
+
"medmcqa_orig_filtered": {
|
25 |
+
"acc,none": 0.7816091954022989,
|
26 |
+
"acc_stderr,none": 0.02217927096875997,
|
27 |
+
"acc_norm,none": 0.7816091954022989,
|
28 |
+
"acc_norm_stderr,none": 0.02217927096875997,
|
29 |
+
"alias": " - medmcqa_orig_filtered"
|
30 |
+
},
|
31 |
+
"medqa_4options_g2b": {
|
32 |
+
"acc,none": 0.7275132275132276,
|
33 |
+
"acc_stderr,none": 0.022930973071633363,
|
34 |
+
"acc_norm,none": 0.7275132275132276,
|
35 |
+
"acc_norm_stderr,none": 0.022930973071633363,
|
36 |
+
"alias": " - medqa_4options_g2b"
|
37 |
+
},
|
38 |
+
"medqa_4options_orig_filtered": {
|
39 |
+
"acc,none": 0.7513227513227513,
|
40 |
+
"acc_stderr,none": 0.022261817692400168,
|
41 |
+
"acc_norm,none": 0.7513227513227513,
|
42 |
+
"acc_norm_stderr,none": 0.022261817692400168,
|
43 |
+
"alias": " - medqa_4options_orig_filtered"
|
44 |
+
}
|
45 |
+
},
|
46 |
+
"groups": {
|
47 |
+
"b4b": {
|
48 |
+
"acc,none": 0.8255240443896424,
|
49 |
+
"acc_stderr,none": 0.07700722588574725,
|
50 |
+
"acc_norm,none": 0.8255240443896424,
|
51 |
+
"acc_norm_stderr,none": 0.07700722588574725,
|
52 |
+
"alias": "b4b"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"configs": {
|
56 |
+
"b4bqa": {
|
57 |
+
"task": "b4bqa",
|
58 |
+
"dataset_path": "AIM-Harvard/b4b_drug_qa",
|
59 |
+
"test_split": "test",
|
60 |
+
"doc_to_text": "<function process_cd at 0x7f572baed090>",
|
61 |
+
"doc_to_target": "correct_choice",
|
62 |
+
"doc_to_choice": [
|
63 |
+
"A",
|
64 |
+
"B",
|
65 |
+
"C",
|
66 |
+
"D"
|
67 |
+
],
|
68 |
+
"description": "",
|
69 |
+
"target_delimiter": " ",
|
70 |
+
"fewshot_delimiter": "\n\n",
|
71 |
+
"metric_list": [
|
72 |
+
{
|
73 |
+
"metric": "acc",
|
74 |
+
"aggregation": "mean",
|
75 |
+
"higher_is_better": true
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"metric": "acc_norm",
|
79 |
+
"aggregation": "mean",
|
80 |
+
"higher_is_better": true
|
81 |
+
}
|
82 |
+
],
|
83 |
+
"output_type": "multiple_choice",
|
84 |
+
"repeats": 1,
|
85 |
+
"should_decontaminate": false
|
86 |
+
},
|
87 |
+
"medmcqa_g2b": {
|
88 |
+
"task": "medmcqa_g2b",
|
89 |
+
"dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
|
90 |
+
"training_split": "train",
|
91 |
+
"validation_split": "validation",
|
92 |
+
"test_split": "validation",
|
93 |
+
"doc_to_text": "<function doc_to_text at 0x7f5729f00550>",
|
94 |
+
"doc_to_target": "cop",
|
95 |
+
"doc_to_choice": [
|
96 |
+
"A",
|
97 |
+
"B",
|
98 |
+
"C",
|
99 |
+
"D"
|
100 |
+
],
|
101 |
+
"description": "",
|
102 |
+
"target_delimiter": " ",
|
103 |
+
"fewshot_delimiter": "\n\n",
|
104 |
+
"metric_list": [
|
105 |
+
{
|
106 |
+
"metric": "acc",
|
107 |
+
"aggregation": "mean",
|
108 |
+
"higher_is_better": true
|
109 |
+
},
|
110 |
+
{
|
111 |
+
"metric": "acc_norm",
|
112 |
+
"aggregation": "mean",
|
113 |
+
"higher_is_better": true
|
114 |
+
}
|
115 |
+
],
|
116 |
+
"output_type": "multiple_choice",
|
117 |
+
"repeats": 1,
|
118 |
+
"should_decontaminate": true,
|
119 |
+
"doc_to_decontamination_query": "{{question}}"
|
120 |
+
},
|
121 |
+
"medmcqa_orig_filtered": {
|
122 |
+
"task": "medmcqa_orig_filtered",
|
123 |
+
"dataset_path": "AIM-Harvard/medmcqa_original",
|
124 |
+
"training_split": "train",
|
125 |
+
"validation_split": "validation",
|
126 |
+
"test_split": "validation",
|
127 |
+
"doc_to_text": "<function doc_to_text at 0x7f5729f00700>",
|
128 |
+
"doc_to_target": "cop",
|
129 |
+
"doc_to_choice": [
|
130 |
+
"A",
|
131 |
+
"B",
|
132 |
+
"C",
|
133 |
+
"D"
|
134 |
+
],
|
135 |
+
"description": "",
|
136 |
+
"target_delimiter": " ",
|
137 |
+
"fewshot_delimiter": "\n\n",
|
138 |
+
"metric_list": [
|
139 |
+
{
|
140 |
+
"metric": "acc",
|
141 |
+
"aggregation": "mean",
|
142 |
+
"higher_is_better": true
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"metric": "acc_norm",
|
146 |
+
"aggregation": "mean",
|
147 |
+
"higher_is_better": true
|
148 |
+
}
|
149 |
+
],
|
150 |
+
"output_type": "multiple_choice",
|
151 |
+
"repeats": 1,
|
152 |
+
"should_decontaminate": true,
|
153 |
+
"doc_to_decontamination_query": "{{question}}"
|
154 |
+
},
|
155 |
+
"medqa_4options_g2b": {
|
156 |
+
"task": "medqa_4options_g2b",
|
157 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
|
158 |
+
"training_split": "train",
|
159 |
+
"validation_split": "validation",
|
160 |
+
"test_split": "test",
|
161 |
+
"doc_to_text": "<function doc_to_text at 0x7f5729f036d0>",
|
162 |
+
"doc_to_target": "<function doc_to_target at 0x7f5729f03a30>",
|
163 |
+
"doc_to_choice": [
|
164 |
+
"A",
|
165 |
+
"B",
|
166 |
+
"C",
|
167 |
+
"D"
|
168 |
+
],
|
169 |
+
"description": "",
|
170 |
+
"target_delimiter": " ",
|
171 |
+
"fewshot_delimiter": "\n\n",
|
172 |
+
"metric_list": [
|
173 |
+
{
|
174 |
+
"metric": "acc",
|
175 |
+
"aggregation": "mean",
|
176 |
+
"higher_is_better": true
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"metric": "acc_norm",
|
180 |
+
"aggregation": "mean",
|
181 |
+
"higher_is_better": true
|
182 |
+
}
|
183 |
+
],
|
184 |
+
"output_type": "multiple_choice",
|
185 |
+
"repeats": 1,
|
186 |
+
"should_decontaminate": false
|
187 |
+
},
|
188 |
+
"medqa_4options_orig_filtered": {
|
189 |
+
"task": "medqa_4options_orig_filtered",
|
190 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
|
191 |
+
"training_split": "train",
|
192 |
+
"validation_split": "validation",
|
193 |
+
"test_split": "test",
|
194 |
+
"doc_to_text": "<function doc_to_text at 0x7f5729f03c70>",
|
195 |
+
"doc_to_target": "<function doc_to_target at 0x7f5729f03eb0>",
|
196 |
+
"doc_to_choice": [
|
197 |
+
"A",
|
198 |
+
"B",
|
199 |
+
"C",
|
200 |
+
"D"
|
201 |
+
],
|
202 |
+
"description": "",
|
203 |
+
"target_delimiter": " ",
|
204 |
+
"fewshot_delimiter": "\n\n",
|
205 |
+
"metric_list": [
|
206 |
+
{
|
207 |
+
"metric": "acc",
|
208 |
+
"aggregation": "mean",
|
209 |
+
"higher_is_better": true
|
210 |
+
},
|
211 |
+
{
|
212 |
+
"metric": "acc_norm",
|
213 |
+
"aggregation": "mean",
|
214 |
+
"higher_is_better": true
|
215 |
+
}
|
216 |
+
],
|
217 |
+
"output_type": "multiple_choice",
|
218 |
+
"repeats": 1,
|
219 |
+
"should_decontaminate": false
|
220 |
+
}
|
221 |
+
},
|
222 |
+
"versions": {
|
223 |
+
"b4b": "N/A",
|
224 |
+
"b4bqa": "Yaml",
|
225 |
+
"medmcqa_g2b": "Yaml",
|
226 |
+
"medmcqa_orig_filtered": "Yaml",
|
227 |
+
"medqa_4options_g2b": "Yaml",
|
228 |
+
"medqa_4options_orig_filtered": "Yaml"
|
229 |
+
},
|
230 |
+
"n-shot": {
|
231 |
+
"b4b": 0,
|
232 |
+
"b4bqa": 0,
|
233 |
+
"medmcqa_g2b": 0,
|
234 |
+
"medmcqa_orig_filtered": 0,
|
235 |
+
"medqa_4options_g2b": 0,
|
236 |
+
"medqa_4options_orig_filtered": 0
|
237 |
+
},
|
238 |
+
"config": {
|
239 |
+
"model": "hf",
|
240 |
+
"model_args": "pretrained=meta-llama/Meta-Llama-3-70B,load_in_4bit=True",
|
241 |
+
"batch_size": "4",
|
242 |
+
"batch_sizes": [],
|
243 |
+
"device": "cuda:0",
|
244 |
+
"use_cache": null,
|
245 |
+
"limit": null,
|
246 |
+
"bootstrap_iters": 100000,
|
247 |
+
"gen_kwargs": null
|
248 |
+
},
|
249 |
+
"git_hash": "928c7657"
|
250 |
+
}
|
data/raw-eval-outputs/meta-llama-Meta-Llama-3-8B_results.json
ADDED
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"b4b": {
|
4 |
+
"acc,none": 0.7120838471023427,
|
5 |
+
"acc_stderr,none": 0.11202233860795015,
|
6 |
+
"acc_norm,none": 0.7120838471023427,
|
7 |
+
"acc_norm_stderr,none": 0.11202233860795015,
|
8 |
+
"alias": "b4b"
|
9 |
+
},
|
10 |
+
"b4bqa": {
|
11 |
+
"acc,none": 0.8270089285714286,
|
12 |
+
"acc_stderr,none": 0.00893756370730241,
|
13 |
+
"acc_norm,none": 0.8270089285714286,
|
14 |
+
"acc_norm_stderr,none": 0.00893756370730241,
|
15 |
+
"alias": " - b4bqa"
|
16 |
+
},
|
17 |
+
"medmcqa_g2b": {
|
18 |
+
"acc,none": 0.5287356321839081,
|
19 |
+
"acc_stderr,none": 0.02679704183010415,
|
20 |
+
"acc_norm,none": 0.5287356321839081,
|
21 |
+
"acc_norm_stderr,none": 0.02679704183010415,
|
22 |
+
"alias": " - medmcqa_g2b"
|
23 |
+
},
|
24 |
+
"medmcqa_orig_filtered": {
|
25 |
+
"acc,none": 0.5919540229885057,
|
26 |
+
"acc_stderr,none": 0.026383584629731508,
|
27 |
+
"acc_norm,none": 0.5919540229885057,
|
28 |
+
"acc_norm_stderr,none": 0.026383584629731508,
|
29 |
+
"alias": " - medmcqa_orig_filtered"
|
30 |
+
},
|
31 |
+
"medqa_4options_g2b": {
|
32 |
+
"acc,none": 0.5502645502645502,
|
33 |
+
"acc_stderr,none": 0.025620857042936655,
|
34 |
+
"acc_norm,none": 0.5502645502645502,
|
35 |
+
"acc_norm_stderr,none": 0.025620857042936655,
|
36 |
+
"alias": " - medqa_4options_g2b"
|
37 |
+
},
|
38 |
+
"medqa_4options_orig_filtered": {
|
39 |
+
"acc,none": 0.6084656084656085,
|
40 |
+
"acc_stderr,none": 0.025138091388851102,
|
41 |
+
"acc_norm,none": 0.6084656084656085,
|
42 |
+
"acc_norm_stderr,none": 0.025138091388851102,
|
43 |
+
"alias": " - medqa_4options_orig_filtered"
|
44 |
+
}
|
45 |
+
},
|
46 |
+
"groups": {
|
47 |
+
"b4b": {
|
48 |
+
"acc,none": 0.7120838471023427,
|
49 |
+
"acc_stderr,none": 0.11202233860795015,
|
50 |
+
"acc_norm,none": 0.7120838471023427,
|
51 |
+
"acc_norm_stderr,none": 0.11202233860795015,
|
52 |
+
"alias": "b4b"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"configs": {
|
56 |
+
"b4bqa": {
|
57 |
+
"task": "b4bqa",
|
58 |
+
"dataset_path": "AIM-Harvard/b4b_drug_qa",
|
59 |
+
"test_split": "test",
|
60 |
+
"doc_to_text": "<function process_cd at 0x7ff55118d090>",
|
61 |
+
"doc_to_target": "correct_choice",
|
62 |
+
"doc_to_choice": [
|
63 |
+
"A",
|
64 |
+
"B",
|
65 |
+
"C",
|
66 |
+
"D"
|
67 |
+
],
|
68 |
+
"description": "",
|
69 |
+
"target_delimiter": " ",
|
70 |
+
"fewshot_delimiter": "\n\n",
|
71 |
+
"metric_list": [
|
72 |
+
{
|
73 |
+
"metric": "acc",
|
74 |
+
"aggregation": "mean",
|
75 |
+
"higher_is_better": true
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"metric": "acc_norm",
|
79 |
+
"aggregation": "mean",
|
80 |
+
"higher_is_better": true
|
81 |
+
}
|
82 |
+
],
|
83 |
+
"output_type": "multiple_choice",
|
84 |
+
"repeats": 1,
|
85 |
+
"should_decontaminate": false
|
86 |
+
},
|
87 |
+
"medmcqa_g2b": {
|
88 |
+
"task": "medmcqa_g2b",
|
89 |
+
"dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
|
90 |
+
"training_split": "train",
|
91 |
+
"validation_split": "validation",
|
92 |
+
"test_split": "validation",
|
93 |
+
"doc_to_text": "<function doc_to_text at 0x7ff55058c550>",
|
94 |
+
"doc_to_target": "cop",
|
95 |
+
"doc_to_choice": [
|
96 |
+
"A",
|
97 |
+
"B",
|
98 |
+
"C",
|
99 |
+
"D"
|
100 |
+
],
|
101 |
+
"description": "",
|
102 |
+
"target_delimiter": " ",
|
103 |
+
"fewshot_delimiter": "\n\n",
|
104 |
+
"metric_list": [
|
105 |
+
{
|
106 |
+
"metric": "acc",
|
107 |
+
"aggregation": "mean",
|
108 |
+
"higher_is_better": true
|
109 |
+
},
|
110 |
+
{
|
111 |
+
"metric": "acc_norm",
|
112 |
+
"aggregation": "mean",
|
113 |
+
"higher_is_better": true
|
114 |
+
}
|
115 |
+
],
|
116 |
+
"output_type": "multiple_choice",
|
117 |
+
"repeats": 1,
|
118 |
+
"should_decontaminate": true,
|
119 |
+
"doc_to_decontamination_query": "{{question}}"
|
120 |
+
},
|
121 |
+
"medmcqa_orig_filtered": {
|
122 |
+
"task": "medmcqa_orig_filtered",
|
123 |
+
"dataset_path": "AIM-Harvard/medmcqa_original",
|
124 |
+
"training_split": "train",
|
125 |
+
"validation_split": "validation",
|
126 |
+
"test_split": "validation",
|
127 |
+
"doc_to_text": "<function doc_to_text at 0x7ff55058c700>",
|
128 |
+
"doc_to_target": "cop",
|
129 |
+
"doc_to_choice": [
|
130 |
+
"A",
|
131 |
+
"B",
|
132 |
+
"C",
|
133 |
+
"D"
|
134 |
+
],
|
135 |
+
"description": "",
|
136 |
+
"target_delimiter": " ",
|
137 |
+
"fewshot_delimiter": "\n\n",
|
138 |
+
"metric_list": [
|
139 |
+
{
|
140 |
+
"metric": "acc",
|
141 |
+
"aggregation": "mean",
|
142 |
+
"higher_is_better": true
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"metric": "acc_norm",
|
146 |
+
"aggregation": "mean",
|
147 |
+
"higher_is_better": true
|
148 |
+
}
|
149 |
+
],
|
150 |
+
"output_type": "multiple_choice",
|
151 |
+
"repeats": 1,
|
152 |
+
"should_decontaminate": true,
|
153 |
+
"doc_to_decontamination_query": "{{question}}"
|
154 |
+
},
|
155 |
+
"medqa_4options_g2b": {
|
156 |
+
"task": "medqa_4options_g2b",
|
157 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
|
158 |
+
"training_split": "train",
|
159 |
+
"validation_split": "validation",
|
160 |
+
"test_split": "test",
|
161 |
+
"doc_to_text": "<function doc_to_text at 0x7ff55058f6d0>",
|
162 |
+
"doc_to_target": "<function doc_to_target at 0x7ff55058fa30>",
|
163 |
+
"doc_to_choice": [
|
164 |
+
"A",
|
165 |
+
"B",
|
166 |
+
"C",
|
167 |
+
"D"
|
168 |
+
],
|
169 |
+
"description": "",
|
170 |
+
"target_delimiter": " ",
|
171 |
+
"fewshot_delimiter": "\n\n",
|
172 |
+
"metric_list": [
|
173 |
+
{
|
174 |
+
"metric": "acc",
|
175 |
+
"aggregation": "mean",
|
176 |
+
"higher_is_better": true
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"metric": "acc_norm",
|
180 |
+
"aggregation": "mean",
|
181 |
+
"higher_is_better": true
|
182 |
+
}
|
183 |
+
],
|
184 |
+
"output_type": "multiple_choice",
|
185 |
+
"repeats": 1,
|
186 |
+
"should_decontaminate": false
|
187 |
+
},
|
188 |
+
"medqa_4options_orig_filtered": {
|
189 |
+
"task": "medqa_4options_orig_filtered",
|
190 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
|
191 |
+
"training_split": "train",
|
192 |
+
"validation_split": "validation",
|
193 |
+
"test_split": "test",
|
194 |
+
"doc_to_text": "<function doc_to_text at 0x7ff55058fc70>",
|
195 |
+
"doc_to_target": "<function doc_to_target at 0x7ff55058feb0>",
|
196 |
+
"doc_to_choice": [
|
197 |
+
"A",
|
198 |
+
"B",
|
199 |
+
"C",
|
200 |
+
"D"
|
201 |
+
],
|
202 |
+
"description": "",
|
203 |
+
"target_delimiter": " ",
|
204 |
+
"fewshot_delimiter": "\n\n",
|
205 |
+
"metric_list": [
|
206 |
+
{
|
207 |
+
"metric": "acc",
|
208 |
+
"aggregation": "mean",
|
209 |
+
"higher_is_better": true
|
210 |
+
},
|
211 |
+
{
|
212 |
+
"metric": "acc_norm",
|
213 |
+
"aggregation": "mean",
|
214 |
+
"higher_is_better": true
|
215 |
+
}
|
216 |
+
],
|
217 |
+
"output_type": "multiple_choice",
|
218 |
+
"repeats": 1,
|
219 |
+
"should_decontaminate": false
|
220 |
+
}
|
221 |
+
},
|
222 |
+
"versions": {
|
223 |
+
"b4b": "N/A",
|
224 |
+
"b4bqa": "Yaml",
|
225 |
+
"medmcqa_g2b": "Yaml",
|
226 |
+
"medmcqa_orig_filtered": "Yaml",
|
227 |
+
"medqa_4options_g2b": "Yaml",
|
228 |
+
"medqa_4options_orig_filtered": "Yaml"
|
229 |
+
},
|
230 |
+
"n-shot": {
|
231 |
+
"b4b": 0,
|
232 |
+
"b4bqa": 0,
|
233 |
+
"medmcqa_g2b": 0,
|
234 |
+
"medmcqa_orig_filtered": 0,
|
235 |
+
"medqa_4options_g2b": 0,
|
236 |
+
"medqa_4options_orig_filtered": 0
|
237 |
+
},
|
238 |
+
"config": {
|
239 |
+
"model": "hf",
|
240 |
+
"model_args": "pretrained=meta-llama/Meta-Llama-3-8B,load_in_4bit=True",
|
241 |
+
"batch_size": "4",
|
242 |
+
"batch_sizes": [],
|
243 |
+
"device": "cuda:0",
|
244 |
+
"use_cache": null,
|
245 |
+
"limit": null,
|
246 |
+
"bootstrap_iters": 100000,
|
247 |
+
"gen_kwargs": null
|
248 |
+
},
|
249 |
+
"git_hash": "928c7657"
|
250 |
+
}
|
data/raw-eval-outputs/microsoft-Phi-3-medium-4k-instruct_results.json
ADDED
@@ -0,0 +1,316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"b4b": {
|
4 |
+
"acc,none": 0.6593711467324291,
|
5 |
+
"acc_stderr,none": 0.05882406104148581,
|
6 |
+
"acc_norm,none": 0.6593711467324291,
|
7 |
+
"acc_norm_stderr,none": 0.05882406104148581,
|
8 |
+
"alias": "b4b"
|
9 |
+
},
|
10 |
+
"b4bqa": {
|
11 |
+
"acc,none": 0.6997767857142857,
|
12 |
+
"acc_stderr,none": 0.010830639682891873,
|
13 |
+
"acc_norm,none": 0.6997767857142857,
|
14 |
+
"acc_norm_stderr,none": 0.010830639682891873,
|
15 |
+
"alias": " - b4bqa"
|
16 |
+
},
|
17 |
+
"medmcqa_g2b": {
|
18 |
+
"acc,none": 0.603448275862069,
|
19 |
+
"acc_stderr,none": 0.026260634141933786,
|
20 |
+
"acc_norm,none": 0.603448275862069,
|
21 |
+
"acc_norm_stderr,none": 0.026260634141933786,
|
22 |
+
"alias": " - medmcqa_g2b"
|
23 |
+
},
|
24 |
+
"medmcqa_orig_filtered": {
|
25 |
+
"acc,none": 0.7241379310344828,
|
26 |
+
"acc_stderr,none": 0.023993406146998367,
|
27 |
+
"acc_norm,none": 0.7241379310344828,
|
28 |
+
"acc_norm_stderr,none": 0.023993406146998367,
|
29 |
+
"alias": " - medmcqa_orig_filtered"
|
30 |
+
},
|
31 |
+
"medqa_4options_g2b": {
|
32 |
+
"acc,none": 0.5343915343915344,
|
33 |
+
"acc_stderr,none": 0.025690321762493848,
|
34 |
+
"acc_norm,none": 0.5343915343915344,
|
35 |
+
"acc_norm_stderr,none": 0.025690321762493848,
|
36 |
+
"alias": " - medqa_4options_g2b"
|
37 |
+
},
|
38 |
+
"medqa_4options_orig_filtered": {
|
39 |
+
"acc,none": 0.5846560846560847,
|
40 |
+
"acc_stderr,none": 0.025379524910778398,
|
41 |
+
"acc_norm,none": 0.5846560846560847,
|
42 |
+
"acc_norm_stderr,none": 0.025379524910778398,
|
43 |
+
"alias": " - medqa_4options_orig_filtered"
|
44 |
+
}
|
45 |
+
},
|
46 |
+
"groups": {
|
47 |
+
"b4b": {
|
48 |
+
"acc,none": 0.6593711467324291,
|
49 |
+
"acc_stderr,none": 0.05882406104148581,
|
50 |
+
"acc_norm,none": 0.6593711467324291,
|
51 |
+
"acc_norm_stderr,none": 0.05882406104148581,
|
52 |
+
"alias": "b4b"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"configs": {
|
56 |
+
"b4bqa": {
|
57 |
+
"task": "b4bqa",
|
58 |
+
"dataset_path": "AIM-Harvard/b4b_drug_qa",
|
59 |
+
"test_split": "test",
|
60 |
+
"doc_to_text": "<function process_cd at 0x7f872445dee0>",
|
61 |
+
"doc_to_target": "correct_choice",
|
62 |
+
"doc_to_choice": [
|
63 |
+
"A",
|
64 |
+
"B",
|
65 |
+
"C",
|
66 |
+
"D"
|
67 |
+
],
|
68 |
+
"description": "",
|
69 |
+
"target_delimiter": " ",
|
70 |
+
"fewshot_delimiter": "\n\n",
|
71 |
+
"metric_list": [
|
72 |
+
{
|
73 |
+
"metric": "acc",
|
74 |
+
"aggregation": "mean",
|
75 |
+
"higher_is_better": true
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"metric": "acc_norm",
|
79 |
+
"aggregation": "mean",
|
80 |
+
"higher_is_better": true
|
81 |
+
}
|
82 |
+
],
|
83 |
+
"output_type": "multiple_choice",
|
84 |
+
"repeats": 1,
|
85 |
+
"should_decontaminate": false
|
86 |
+
},
|
87 |
+
"medmcqa_g2b": {
|
88 |
+
"task": "medmcqa_g2b",
|
89 |
+
"dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
|
90 |
+
"training_split": "train",
|
91 |
+
"validation_split": "validation",
|
92 |
+
"test_split": "validation",
|
93 |
+
"doc_to_text": "<function doc_to_text at 0x7f87249823a0>",
|
94 |
+
"doc_to_target": "cop",
|
95 |
+
"doc_to_choice": [
|
96 |
+
"A",
|
97 |
+
"B",
|
98 |
+
"C",
|
99 |
+
"D"
|
100 |
+
],
|
101 |
+
"description": "",
|
102 |
+
"target_delimiter": " ",
|
103 |
+
"fewshot_delimiter": "\n\n",
|
104 |
+
"metric_list": [
|
105 |
+
{
|
106 |
+
"metric": "acc",
|
107 |
+
"aggregation": "mean",
|
108 |
+
"higher_is_better": true
|
109 |
+
},
|
110 |
+
{
|
111 |
+
"metric": "acc_norm",
|
112 |
+
"aggregation": "mean",
|
113 |
+
"higher_is_better": true
|
114 |
+
}
|
115 |
+
],
|
116 |
+
"output_type": "multiple_choice",
|
117 |
+
"repeats": 1,
|
118 |
+
"should_decontaminate": true,
|
119 |
+
"doc_to_decontamination_query": "{{question}}"
|
120 |
+
},
|
121 |
+
"medmcqa_orig_filtered": {
|
122 |
+
"task": "medmcqa_orig_filtered",
|
123 |
+
"dataset_path": "AIM-Harvard/medmcqa_original",
|
124 |
+
"training_split": "train",
|
125 |
+
"validation_split": "validation",
|
126 |
+
"test_split": "validation",
|
127 |
+
"doc_to_text": "<function doc_to_text at 0x7f87242cb310>",
|
128 |
+
"doc_to_target": "cop",
|
129 |
+
"doc_to_choice": [
|
130 |
+
"A",
|
131 |
+
"B",
|
132 |
+
"C",
|
133 |
+
"D"
|
134 |
+
],
|
135 |
+
"description": "",
|
136 |
+
"target_delimiter": " ",
|
137 |
+
"fewshot_delimiter": "\n\n",
|
138 |
+
"metric_list": [
|
139 |
+
{
|
140 |
+
"metric": "acc",
|
141 |
+
"aggregation": "mean",
|
142 |
+
"higher_is_better": true
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"metric": "acc_norm",
|
146 |
+
"aggregation": "mean",
|
147 |
+
"higher_is_better": true
|
148 |
+
}
|
149 |
+
],
|
150 |
+
"output_type": "multiple_choice",
|
151 |
+
"repeats": 1,
|
152 |
+
"should_decontaminate": true,
|
153 |
+
"doc_to_decontamination_query": "{{question}}"
|
154 |
+
},
|
155 |
+
"medqa_4options_g2b": {
|
156 |
+
"task": "medqa_4options_g2b",
|
157 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
|
158 |
+
"training_split": "train",
|
159 |
+
"validation_split": "validation",
|
160 |
+
"test_split": "test",
|
161 |
+
"doc_to_text": "<function doc_to_text at 0x7f8724982820>",
|
162 |
+
"doc_to_target": "<function doc_to_target at 0x7f8724982b80>",
|
163 |
+
"doc_to_choice": [
|
164 |
+
"A",
|
165 |
+
"B",
|
166 |
+
"C",
|
167 |
+
"D"
|
168 |
+
],
|
169 |
+
"description": "",
|
170 |
+
"target_delimiter": " ",
|
171 |
+
"fewshot_delimiter": "\n\n",
|
172 |
+
"metric_list": [
|
173 |
+
{
|
174 |
+
"metric": "acc",
|
175 |
+
"aggregation": "mean",
|
176 |
+
"higher_is_better": true
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"metric": "acc_norm",
|
180 |
+
"aggregation": "mean",
|
181 |
+
"higher_is_better": true
|
182 |
+
}
|
183 |
+
],
|
184 |
+
"output_type": "multiple_choice",
|
185 |
+
"repeats": 1,
|
186 |
+
"should_decontaminate": false
|
187 |
+
},
|
188 |
+
"medqa_4options_orig_filtered": {
|
189 |
+
"task": "medqa_4options_orig_filtered",
|
190 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
|
191 |
+
"training_split": "train",
|
192 |
+
"validation_split": "validation",
|
193 |
+
"test_split": "test",
|
194 |
+
"doc_to_text": "<function doc_to_text at 0x7f872447f4c0>",
|
195 |
+
"doc_to_target": "<function doc_to_target at 0x7f872442cee0>",
|
196 |
+
"doc_to_choice": [
|
197 |
+
"A",
|
198 |
+
"B",
|
199 |
+
"C",
|
200 |
+
"D"
|
201 |
+
],
|
202 |
+
"description": "",
|
203 |
+
"target_delimiter": " ",
|
204 |
+
"fewshot_delimiter": "\n\n",
|
205 |
+
"metric_list": [
|
206 |
+
{
|
207 |
+
"metric": "acc",
|
208 |
+
"aggregation": "mean",
|
209 |
+
"higher_is_better": true
|
210 |
+
},
|
211 |
+
{
|
212 |
+
"metric": "acc_norm",
|
213 |
+
"aggregation": "mean",
|
214 |
+
"higher_is_better": true
|
215 |
+
}
|
216 |
+
],
|
217 |
+
"output_type": "multiple_choice",
|
218 |
+
"repeats": 1,
|
219 |
+
"should_decontaminate": false
|
220 |
+
}
|
221 |
+
},
|
222 |
+
"versions": {
|
223 |
+
"b4b": "N/A",
|
224 |
+
"b4bqa": "Yaml",
|
225 |
+
"medmcqa_g2b": "Yaml",
|
226 |
+
"medmcqa_orig_filtered": "Yaml",
|
227 |
+
"medqa_4options_g2b": "Yaml",
|
228 |
+
"medqa_4options_orig_filtered": "Yaml"
|
229 |
+
},
|
230 |
+
"n-shot": {
|
231 |
+
"b4b": 0,
|
232 |
+
"b4bqa": 0,
|
233 |
+
"medmcqa_g2b": 0,
|
234 |
+
"medmcqa_orig_filtered": 0,
|
235 |
+
"medqa_4options_g2b": 0,
|
236 |
+
"medqa_4options_orig_filtered": 0
|
237 |
+
},
|
238 |
+
"config": {
|
239 |
+
"model": "hf",
|
240 |
+
"model_args": "pretrained=microsoft/Phi-3-medium-4k-instruct,load_in_4bit=True",
|
241 |
+
"batch_size": "auto:64",
|
242 |
+
"batch_sizes": [
|
243 |
+
8,
|
244 |
+
16,
|
245 |
+
32,
|
246 |
+
32,
|
247 |
+
32,
|
248 |
+
32,
|
249 |
+
32,
|
250 |
+
32,
|
251 |
+
32,
|
252 |
+
32,
|
253 |
+
32,
|
254 |
+
32,
|
255 |
+
64,
|
256 |
+
64,
|
257 |
+
64,
|
258 |
+
64,
|
259 |
+
64,
|
260 |
+
64,
|
261 |
+
64,
|
262 |
+
64,
|
263 |
+
64,
|
264 |
+
64,
|
265 |
+
64,
|
266 |
+
64,
|
267 |
+
64,
|
268 |
+
64,
|
269 |
+
64,
|
270 |
+
64,
|
271 |
+
64,
|
272 |
+
64,
|
273 |
+
64,
|
274 |
+
64,
|
275 |
+
64,
|
276 |
+
64,
|
277 |
+
64,
|
278 |
+
64,
|
279 |
+
64,
|
280 |
+
64,
|
281 |
+
64,
|
282 |
+
64,
|
283 |
+
64,
|
284 |
+
64,
|
285 |
+
64,
|
286 |
+
64,
|
287 |
+
64,
|
288 |
+
64,
|
289 |
+
64,
|
290 |
+
64,
|
291 |
+
64,
|
292 |
+
64,
|
293 |
+
64,
|
294 |
+
64,
|
295 |
+
64,
|
296 |
+
64,
|
297 |
+
64,
|
298 |
+
64,
|
299 |
+
64,
|
300 |
+
64,
|
301 |
+
64,
|
302 |
+
64,
|
303 |
+
64,
|
304 |
+
64,
|
305 |
+
64,
|
306 |
+
64,
|
307 |
+
64
|
308 |
+
],
|
309 |
+
"device": "cuda:0",
|
310 |
+
"use_cache": null,
|
311 |
+
"limit": null,
|
312 |
+
"bootstrap_iters": 100000,
|
313 |
+
"gen_kwargs": null
|
314 |
+
},
|
315 |
+
"git_hash": "928c7657"
|
316 |
+
}
|
data/raw-eval-outputs/microsoft-phi-1_5_results.json
ADDED
@@ -0,0 +1,316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"b4b": {
|
4 |
+
"acc,none": 0.3024044389642417,
|
5 |
+
"acc_stderr,none": 0.030335029823792846,
|
6 |
+
"acc_norm,none": 0.3024044389642417,
|
7 |
+
"acc_norm_stderr,none": 0.030335029823792846,
|
8 |
+
"alias": "b4b"
|
9 |
+
},
|
10 |
+
"b4bqa": {
|
11 |
+
"acc,none": 0.28013392857142855,
|
12 |
+
"acc_stderr,none": 0.010611112414051155,
|
13 |
+
"acc_norm,none": 0.28013392857142855,
|
14 |
+
"acc_norm_stderr,none": 0.010611112414051155,
|
15 |
+
"alias": " - b4bqa"
|
16 |
+
},
|
17 |
+
"medmcqa_g2b": {
|
18 |
+
"acc,none": 0.3160919540229885,
|
19 |
+
"acc_stderr,none": 0.024959784982131285,
|
20 |
+
"acc_norm,none": 0.3160919540229885,
|
21 |
+
"acc_norm_stderr,none": 0.024959784982131285,
|
22 |
+
"alias": " - medmcqa_g2b"
|
23 |
+
},
|
24 |
+
"medmcqa_orig_filtered": {
|
25 |
+
"acc,none": 0.3045977011494253,
|
26 |
+
"acc_stderr,none": 0.024706807658616183,
|
27 |
+
"acc_norm,none": 0.3045977011494253,
|
28 |
+
"acc_norm_stderr,none": 0.024706807658616183,
|
29 |
+
"alias": " - medmcqa_orig_filtered"
|
30 |
+
},
|
31 |
+
"medqa_4options_g2b": {
|
32 |
+
"acc,none": 0.3492063492063492,
|
33 |
+
"acc_stderr,none": 0.024552292209342654,
|
34 |
+
"acc_norm,none": 0.3492063492063492,
|
35 |
+
"acc_norm_stderr,none": 0.024552292209342654,
|
36 |
+
"alias": " - medqa_4options_g2b"
|
37 |
+
},
|
38 |
+
"medqa_4options_orig_filtered": {
|
39 |
+
"acc,none": 0.34656084656084657,
|
40 |
+
"acc_stderr,none": 0.024508777521028435,
|
41 |
+
"acc_norm,none": 0.34656084656084657,
|
42 |
+
"acc_norm_stderr,none": 0.024508777521028435,
|
43 |
+
"alias": " - medqa_4options_orig_filtered"
|
44 |
+
}
|
45 |
+
},
|
46 |
+
"groups": {
|
47 |
+
"b4b": {
|
48 |
+
"acc,none": 0.3024044389642417,
|
49 |
+
"acc_stderr,none": 0.030335029823792846,
|
50 |
+
"acc_norm,none": 0.3024044389642417,
|
51 |
+
"acc_norm_stderr,none": 0.030335029823792846,
|
52 |
+
"alias": "b4b"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"configs": {
|
56 |
+
"b4bqa": {
|
57 |
+
"task": "b4bqa",
|
58 |
+
"dataset_path": "AIM-Harvard/b4b_drug_qa",
|
59 |
+
"test_split": "test",
|
60 |
+
"doc_to_text": "<function process_cd at 0x7f202b05ff70>",
|
61 |
+
"doc_to_target": "correct_choice",
|
62 |
+
"doc_to_choice": [
|
63 |
+
"A",
|
64 |
+
"B",
|
65 |
+
"C",
|
66 |
+
"D"
|
67 |
+
],
|
68 |
+
"description": "",
|
69 |
+
"target_delimiter": " ",
|
70 |
+
"fewshot_delimiter": "\n\n",
|
71 |
+
"metric_list": [
|
72 |
+
{
|
73 |
+
"metric": "acc",
|
74 |
+
"aggregation": "mean",
|
75 |
+
"higher_is_better": true
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"metric": "acc_norm",
|
79 |
+
"aggregation": "mean",
|
80 |
+
"higher_is_better": true
|
81 |
+
}
|
82 |
+
],
|
83 |
+
"output_type": "multiple_choice",
|
84 |
+
"repeats": 1,
|
85 |
+
"should_decontaminate": false
|
86 |
+
},
|
87 |
+
"medmcqa_g2b": {
|
88 |
+
"task": "medmcqa_g2b",
|
89 |
+
"dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
|
90 |
+
"training_split": "train",
|
91 |
+
"validation_split": "validation",
|
92 |
+
"test_split": "validation",
|
93 |
+
"doc_to_text": "<function doc_to_text at 0x7f202b59e430>",
|
94 |
+
"doc_to_target": "cop",
|
95 |
+
"doc_to_choice": [
|
96 |
+
"A",
|
97 |
+
"B",
|
98 |
+
"C",
|
99 |
+
"D"
|
100 |
+
],
|
101 |
+
"description": "",
|
102 |
+
"target_delimiter": " ",
|
103 |
+
"fewshot_delimiter": "\n\n",
|
104 |
+
"metric_list": [
|
105 |
+
{
|
106 |
+
"metric": "acc",
|
107 |
+
"aggregation": "mean",
|
108 |
+
"higher_is_better": true
|
109 |
+
},
|
110 |
+
{
|
111 |
+
"metric": "acc_norm",
|
112 |
+
"aggregation": "mean",
|
113 |
+
"higher_is_better": true
|
114 |
+
}
|
115 |
+
],
|
116 |
+
"output_type": "multiple_choice",
|
117 |
+
"repeats": 1,
|
118 |
+
"should_decontaminate": true,
|
119 |
+
"doc_to_decontamination_query": "{{question}}"
|
120 |
+
},
|
121 |
+
"medmcqa_orig_filtered": {
|
122 |
+
"task": "medmcqa_orig_filtered",
|
123 |
+
"dataset_path": "AIM-Harvard/medmcqa_original",
|
124 |
+
"training_split": "train",
|
125 |
+
"validation_split": "validation",
|
126 |
+
"test_split": "validation",
|
127 |
+
"doc_to_text": "<function doc_to_text at 0x7f202aecb3a0>",
|
128 |
+
"doc_to_target": "cop",
|
129 |
+
"doc_to_choice": [
|
130 |
+
"A",
|
131 |
+
"B",
|
132 |
+
"C",
|
133 |
+
"D"
|
134 |
+
],
|
135 |
+
"description": "",
|
136 |
+
"target_delimiter": " ",
|
137 |
+
"fewshot_delimiter": "\n\n",
|
138 |
+
"metric_list": [
|
139 |
+
{
|
140 |
+
"metric": "acc",
|
141 |
+
"aggregation": "mean",
|
142 |
+
"higher_is_better": true
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"metric": "acc_norm",
|
146 |
+
"aggregation": "mean",
|
147 |
+
"higher_is_better": true
|
148 |
+
}
|
149 |
+
],
|
150 |
+
"output_type": "multiple_choice",
|
151 |
+
"repeats": 1,
|
152 |
+
"should_decontaminate": true,
|
153 |
+
"doc_to_decontamination_query": "{{question}}"
|
154 |
+
},
|
155 |
+
"medqa_4options_g2b": {
|
156 |
+
"task": "medqa_4options_g2b",
|
157 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
|
158 |
+
"training_split": "train",
|
159 |
+
"validation_split": "validation",
|
160 |
+
"test_split": "test",
|
161 |
+
"doc_to_text": "<function doc_to_text at 0x7f202b59e8b0>",
|
162 |
+
"doc_to_target": "<function doc_to_target at 0x7f202b59ec10>",
|
163 |
+
"doc_to_choice": [
|
164 |
+
"A",
|
165 |
+
"B",
|
166 |
+
"C",
|
167 |
+
"D"
|
168 |
+
],
|
169 |
+
"description": "",
|
170 |
+
"target_delimiter": " ",
|
171 |
+
"fewshot_delimiter": "\n\n",
|
172 |
+
"metric_list": [
|
173 |
+
{
|
174 |
+
"metric": "acc",
|
175 |
+
"aggregation": "mean",
|
176 |
+
"higher_is_better": true
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"metric": "acc_norm",
|
180 |
+
"aggregation": "mean",
|
181 |
+
"higher_is_better": true
|
182 |
+
}
|
183 |
+
],
|
184 |
+
"output_type": "multiple_choice",
|
185 |
+
"repeats": 1,
|
186 |
+
"should_decontaminate": false
|
187 |
+
},
|
188 |
+
"medqa_4options_orig_filtered": {
|
189 |
+
"task": "medqa_4options_orig_filtered",
|
190 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
|
191 |
+
"training_split": "train",
|
192 |
+
"validation_split": "validation",
|
193 |
+
"test_split": "test",
|
194 |
+
"doc_to_text": "<function doc_to_text at 0x7f202b084550>",
|
195 |
+
"doc_to_target": "<function doc_to_target at 0x7f202b030f70>",
|
196 |
+
"doc_to_choice": [
|
197 |
+
"A",
|
198 |
+
"B",
|
199 |
+
"C",
|
200 |
+
"D"
|
201 |
+
],
|
202 |
+
"description": "",
|
203 |
+
"target_delimiter": " ",
|
204 |
+
"fewshot_delimiter": "\n\n",
|
205 |
+
"metric_list": [
|
206 |
+
{
|
207 |
+
"metric": "acc",
|
208 |
+
"aggregation": "mean",
|
209 |
+
"higher_is_better": true
|
210 |
+
},
|
211 |
+
{
|
212 |
+
"metric": "acc_norm",
|
213 |
+
"aggregation": "mean",
|
214 |
+
"higher_is_better": true
|
215 |
+
}
|
216 |
+
],
|
217 |
+
"output_type": "multiple_choice",
|
218 |
+
"repeats": 1,
|
219 |
+
"should_decontaminate": false
|
220 |
+
}
|
221 |
+
},
|
222 |
+
"versions": {
|
223 |
+
"b4b": "N/A",
|
224 |
+
"b4bqa": "Yaml",
|
225 |
+
"medmcqa_g2b": "Yaml",
|
226 |
+
"medmcqa_orig_filtered": "Yaml",
|
227 |
+
"medqa_4options_g2b": "Yaml",
|
228 |
+
"medqa_4options_orig_filtered": "Yaml"
|
229 |
+
},
|
230 |
+
"n-shot": {
|
231 |
+
"b4b": 0,
|
232 |
+
"b4bqa": 0,
|
233 |
+
"medmcqa_g2b": 0,
|
234 |
+
"medmcqa_orig_filtered": 0,
|
235 |
+
"medqa_4options_g2b": 0,
|
236 |
+
"medqa_4options_orig_filtered": 0
|
237 |
+
},
|
238 |
+
"config": {
|
239 |
+
"model": "hf",
|
240 |
+
"model_args": "pretrained=microsoft/phi-1_5,load_in_4bit=True",
|
241 |
+
"batch_size": "auto:64",
|
242 |
+
"batch_sizes": [
|
243 |
+
32,
|
244 |
+
64,
|
245 |
+
64,
|
246 |
+
64,
|
247 |
+
64,
|
248 |
+
64,
|
249 |
+
64,
|
250 |
+
64,
|
251 |
+
64,
|
252 |
+
64,
|
253 |
+
64,
|
254 |
+
64,
|
255 |
+
64,
|
256 |
+
64,
|
257 |
+
64,
|
258 |
+
64,
|
259 |
+
64,
|
260 |
+
64,
|
261 |
+
64,
|
262 |
+
64,
|
263 |
+
64,
|
264 |
+
64,
|
265 |
+
64,
|
266 |
+
64,
|
267 |
+
64,
|
268 |
+
64,
|
269 |
+
64,
|
270 |
+
64,
|
271 |
+
64,
|
272 |
+
64,
|
273 |
+
64,
|
274 |
+
64,
|
275 |
+
64,
|
276 |
+
64,
|
277 |
+
64,
|
278 |
+
64,
|
279 |
+
64,
|
280 |
+
64,
|
281 |
+
64,
|
282 |
+
64,
|
283 |
+
64,
|
284 |
+
64,
|
285 |
+
64,
|
286 |
+
64,
|
287 |
+
64,
|
288 |
+
64,
|
289 |
+
64,
|
290 |
+
64,
|
291 |
+
64,
|
292 |
+
64,
|
293 |
+
64,
|
294 |
+
64,
|
295 |
+
64,
|
296 |
+
64,
|
297 |
+
64,
|
298 |
+
64,
|
299 |
+
64,
|
300 |
+
64,
|
301 |
+
64,
|
302 |
+
64,
|
303 |
+
64,
|
304 |
+
64,
|
305 |
+
64,
|
306 |
+
64,
|
307 |
+
64
|
308 |
+
],
|
309 |
+
"device": "cuda:0",
|
310 |
+
"use_cache": null,
|
311 |
+
"limit": null,
|
312 |
+
"bootstrap_iters": 100000,
|
313 |
+
"gen_kwargs": null
|
314 |
+
},
|
315 |
+
"git_hash": "928c7657"
|
316 |
+
}
|
data/raw-eval-outputs/microsoft-phi-1_results.json
ADDED
@@ -0,0 +1,316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"b4b": {
|
4 |
+
"acc,none": 0.21177558569667077,
|
5 |
+
"acc_stderr,none": 0.024570863489409633,
|
6 |
+
"acc_norm,none": 0.21177558569667077,
|
7 |
+
"acc_norm_stderr,none": 0.024570863489409633,
|
8 |
+
"alias": "b4b"
|
9 |
+
},
|
10 |
+
"b4bqa": {
|
11 |
+
"acc,none": 0.19642857142857142,
|
12 |
+
"acc_stderr,none": 0.009387863785916705,
|
13 |
+
"acc_norm,none": 0.19642857142857142,
|
14 |
+
"acc_norm_stderr,none": 0.009387863785916705,
|
15 |
+
"alias": " - b4bqa"
|
16 |
+
},
|
17 |
+
"medmcqa_g2b": {
|
18 |
+
"acc,none": 0.2413793103448276,
|
19 |
+
"acc_stderr,none": 0.02297193745254371,
|
20 |
+
"acc_norm,none": 0.2413793103448276,
|
21 |
+
"acc_norm_stderr,none": 0.02297193745254371,
|
22 |
+
"alias": " - medmcqa_g2b"
|
23 |
+
},
|
24 |
+
"medmcqa_orig_filtered": {
|
25 |
+
"acc,none": 0.25862068965517243,
|
26 |
+
"acc_stderr,none": 0.023506454355379604,
|
27 |
+
"acc_norm,none": 0.25862068965517243,
|
28 |
+
"acc_norm_stderr,none": 0.023506454355379604,
|
29 |
+
"alias": " - medmcqa_orig_filtered"
|
30 |
+
},
|
31 |
+
"medqa_4options_g2b": {
|
32 |
+
"acc,none": 0.21693121693121692,
|
33 |
+
"acc_stderr,none": 0.02122708244944506,
|
34 |
+
"acc_norm,none": 0.21693121693121692,
|
35 |
+
"acc_norm_stderr,none": 0.02122708244944506,
|
36 |
+
"alias": " - medqa_4options_g2b"
|
37 |
+
},
|
38 |
+
"medqa_4options_orig_filtered": {
|
39 |
+
"acc,none": 0.20899470899470898,
|
40 |
+
"acc_stderr,none": 0.02094048156533485,
|
41 |
+
"acc_norm,none": 0.20899470899470898,
|
42 |
+
"acc_norm_stderr,none": 0.02094048156533485,
|
43 |
+
"alias": " - medqa_4options_orig_filtered"
|
44 |
+
}
|
45 |
+
},
|
46 |
+
"groups": {
|
47 |
+
"b4b": {
|
48 |
+
"acc,none": 0.21177558569667077,
|
49 |
+
"acc_stderr,none": 0.024570863489409633,
|
50 |
+
"acc_norm,none": 0.21177558569667077,
|
51 |
+
"acc_norm_stderr,none": 0.024570863489409633,
|
52 |
+
"alias": "b4b"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"configs": {
|
56 |
+
"b4bqa": {
|
57 |
+
"task": "b4bqa",
|
58 |
+
"dataset_path": "AIM-Harvard/b4b_drug_qa",
|
59 |
+
"test_split": "test",
|
60 |
+
"doc_to_text": "<function process_cd at 0x7f3613c5fee0>",
|
61 |
+
"doc_to_target": "correct_choice",
|
62 |
+
"doc_to_choice": [
|
63 |
+
"A",
|
64 |
+
"B",
|
65 |
+
"C",
|
66 |
+
"D"
|
67 |
+
],
|
68 |
+
"description": "",
|
69 |
+
"target_delimiter": " ",
|
70 |
+
"fewshot_delimiter": "\n\n",
|
71 |
+
"metric_list": [
|
72 |
+
{
|
73 |
+
"metric": "acc",
|
74 |
+
"aggregation": "mean",
|
75 |
+
"higher_is_better": true
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"metric": "acc_norm",
|
79 |
+
"aggregation": "mean",
|
80 |
+
"higher_is_better": true
|
81 |
+
}
|
82 |
+
],
|
83 |
+
"output_type": "multiple_choice",
|
84 |
+
"repeats": 1,
|
85 |
+
"should_decontaminate": false
|
86 |
+
},
|
87 |
+
"medmcqa_g2b": {
|
88 |
+
"task": "medmcqa_g2b",
|
89 |
+
"dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
|
90 |
+
"training_split": "train",
|
91 |
+
"validation_split": "validation",
|
92 |
+
"test_split": "validation",
|
93 |
+
"doc_to_text": "<function doc_to_text at 0x7f36141953a0>",
|
94 |
+
"doc_to_target": "cop",
|
95 |
+
"doc_to_choice": [
|
96 |
+
"A",
|
97 |
+
"B",
|
98 |
+
"C",
|
99 |
+
"D"
|
100 |
+
],
|
101 |
+
"description": "",
|
102 |
+
"target_delimiter": " ",
|
103 |
+
"fewshot_delimiter": "\n\n",
|
104 |
+
"metric_list": [
|
105 |
+
{
|
106 |
+
"metric": "acc",
|
107 |
+
"aggregation": "mean",
|
108 |
+
"higher_is_better": true
|
109 |
+
},
|
110 |
+
{
|
111 |
+
"metric": "acc_norm",
|
112 |
+
"aggregation": "mean",
|
113 |
+
"higher_is_better": true
|
114 |
+
}
|
115 |
+
],
|
116 |
+
"output_type": "multiple_choice",
|
117 |
+
"repeats": 1,
|
118 |
+
"should_decontaminate": true,
|
119 |
+
"doc_to_decontamination_query": "{{question}}"
|
120 |
+
},
|
121 |
+
"medmcqa_orig_filtered": {
|
122 |
+
"task": "medmcqa_orig_filtered",
|
123 |
+
"dataset_path": "AIM-Harvard/medmcqa_original",
|
124 |
+
"training_split": "train",
|
125 |
+
"validation_split": "validation",
|
126 |
+
"test_split": "validation",
|
127 |
+
"doc_to_text": "<function doc_to_text at 0x7f3613acb310>",
|
128 |
+
"doc_to_target": "cop",
|
129 |
+
"doc_to_choice": [
|
130 |
+
"A",
|
131 |
+
"B",
|
132 |
+
"C",
|
133 |
+
"D"
|
134 |
+
],
|
135 |
+
"description": "",
|
136 |
+
"target_delimiter": " ",
|
137 |
+
"fewshot_delimiter": "\n\n",
|
138 |
+
"metric_list": [
|
139 |
+
{
|
140 |
+
"metric": "acc",
|
141 |
+
"aggregation": "mean",
|
142 |
+
"higher_is_better": true
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"metric": "acc_norm",
|
146 |
+
"aggregation": "mean",
|
147 |
+
"higher_is_better": true
|
148 |
+
}
|
149 |
+
],
|
150 |
+
"output_type": "multiple_choice",
|
151 |
+
"repeats": 1,
|
152 |
+
"should_decontaminate": true,
|
153 |
+
"doc_to_decontamination_query": "{{question}}"
|
154 |
+
},
|
155 |
+
"medqa_4options_g2b": {
|
156 |
+
"task": "medqa_4options_g2b",
|
157 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
|
158 |
+
"training_split": "train",
|
159 |
+
"validation_split": "validation",
|
160 |
+
"test_split": "test",
|
161 |
+
"doc_to_text": "<function doc_to_text at 0x7f3614195820>",
|
162 |
+
"doc_to_target": "<function doc_to_target at 0x7f3614195b80>",
|
163 |
+
"doc_to_choice": [
|
164 |
+
"A",
|
165 |
+
"B",
|
166 |
+
"C",
|
167 |
+
"D"
|
168 |
+
],
|
169 |
+
"description": "",
|
170 |
+
"target_delimiter": " ",
|
171 |
+
"fewshot_delimiter": "\n\n",
|
172 |
+
"metric_list": [
|
173 |
+
{
|
174 |
+
"metric": "acc",
|
175 |
+
"aggregation": "mean",
|
176 |
+
"higher_is_better": true
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"metric": "acc_norm",
|
180 |
+
"aggregation": "mean",
|
181 |
+
"higher_is_better": true
|
182 |
+
}
|
183 |
+
],
|
184 |
+
"output_type": "multiple_choice",
|
185 |
+
"repeats": 1,
|
186 |
+
"should_decontaminate": false
|
187 |
+
},
|
188 |
+
"medqa_4options_orig_filtered": {
|
189 |
+
"task": "medqa_4options_orig_filtered",
|
190 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
|
191 |
+
"training_split": "train",
|
192 |
+
"validation_split": "validation",
|
193 |
+
"test_split": "test",
|
194 |
+
"doc_to_text": "<function doc_to_text at 0x7f3613c814c0>",
|
195 |
+
"doc_to_target": "<function doc_to_target at 0x7f3613c30ee0>",
|
196 |
+
"doc_to_choice": [
|
197 |
+
"A",
|
198 |
+
"B",
|
199 |
+
"C",
|
200 |
+
"D"
|
201 |
+
],
|
202 |
+
"description": "",
|
203 |
+
"target_delimiter": " ",
|
204 |
+
"fewshot_delimiter": "\n\n",
|
205 |
+
"metric_list": [
|
206 |
+
{
|
207 |
+
"metric": "acc",
|
208 |
+
"aggregation": "mean",
|
209 |
+
"higher_is_better": true
|
210 |
+
},
|
211 |
+
{
|
212 |
+
"metric": "acc_norm",
|
213 |
+
"aggregation": "mean",
|
214 |
+
"higher_is_better": true
|
215 |
+
}
|
216 |
+
],
|
217 |
+
"output_type": "multiple_choice",
|
218 |
+
"repeats": 1,
|
219 |
+
"should_decontaminate": false
|
220 |
+
}
|
221 |
+
},
|
222 |
+
"versions": {
|
223 |
+
"b4b": "N/A",
|
224 |
+
"b4bqa": "Yaml",
|
225 |
+
"medmcqa_g2b": "Yaml",
|
226 |
+
"medmcqa_orig_filtered": "Yaml",
|
227 |
+
"medqa_4options_g2b": "Yaml",
|
228 |
+
"medqa_4options_orig_filtered": "Yaml"
|
229 |
+
},
|
230 |
+
"n-shot": {
|
231 |
+
"b4b": 0,
|
232 |
+
"b4bqa": 0,
|
233 |
+
"medmcqa_g2b": 0,
|
234 |
+
"medmcqa_orig_filtered": 0,
|
235 |
+
"medqa_4options_g2b": 0,
|
236 |
+
"medqa_4options_orig_filtered": 0
|
237 |
+
},
|
238 |
+
"config": {
|
239 |
+
"model": "hf",
|
240 |
+
"model_args": "pretrained=microsoft/phi-1,load_in_4bit=True",
|
241 |
+
"batch_size": "auto:64",
|
242 |
+
"batch_sizes": [
|
243 |
+
32,
|
244 |
+
64,
|
245 |
+
64,
|
246 |
+
64,
|
247 |
+
64,
|
248 |
+
64,
|
249 |
+
64,
|
250 |
+
64,
|
251 |
+
64,
|
252 |
+
64,
|
253 |
+
64,
|
254 |
+
64,
|
255 |
+
64,
|
256 |
+
64,
|
257 |
+
64,
|
258 |
+
64,
|
259 |
+
64,
|
260 |
+
64,
|
261 |
+
64,
|
262 |
+
64,
|
263 |
+
64,
|
264 |
+
64,
|
265 |
+
64,
|
266 |
+
64,
|
267 |
+
64,
|
268 |
+
64,
|
269 |
+
64,
|
270 |
+
64,
|
271 |
+
64,
|
272 |
+
64,
|
273 |
+
64,
|
274 |
+
64,
|
275 |
+
64,
|
276 |
+
64,
|
277 |
+
64,
|
278 |
+
64,
|
279 |
+
64,
|
280 |
+
64,
|
281 |
+
64,
|
282 |
+
64,
|
283 |
+
64,
|
284 |
+
64,
|
285 |
+
64,
|
286 |
+
64,
|
287 |
+
64,
|
288 |
+
64,
|
289 |
+
64,
|
290 |
+
64,
|
291 |
+
64,
|
292 |
+
64,
|
293 |
+
64,
|
294 |
+
64,
|
295 |
+
64,
|
296 |
+
64,
|
297 |
+
64,
|
298 |
+
64,
|
299 |
+
64,
|
300 |
+
64,
|
301 |
+
64,
|
302 |
+
64,
|
303 |
+
64,
|
304 |
+
64,
|
305 |
+
64,
|
306 |
+
64,
|
307 |
+
64
|
308 |
+
],
|
309 |
+
"device": "cuda:0",
|
310 |
+
"use_cache": null,
|
311 |
+
"limit": null,
|
312 |
+
"bootstrap_iters": 100000,
|
313 |
+
"gen_kwargs": null
|
314 |
+
},
|
315 |
+
"git_hash": "928c7657"
|
316 |
+
}
|
data/raw-eval-outputs/microsoft-phi-2_results.json
ADDED
@@ -0,0 +1,316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"b4b": {
|
4 |
+
"acc,none": 0.44790382244143034,
|
5 |
+
"acc_stderr,none": 0.0343882858973779,
|
6 |
+
"acc_norm,none": 0.44790382244143034,
|
7 |
+
"acc_norm_stderr,none": 0.0343882858973779,
|
8 |
+
"alias": "b4b"
|
9 |
+
},
|
10 |
+
"b4bqa": {
|
11 |
+
"acc,none": 0.47488839285714285,
|
12 |
+
"acc_stderr,none": 0.01179977682900124,
|
13 |
+
"acc_norm,none": 0.47488839285714285,
|
14 |
+
"acc_norm_stderr,none": 0.01179977682900124,
|
15 |
+
"alias": " - b4bqa"
|
16 |
+
},
|
17 |
+
"medmcqa_g2b": {
|
18 |
+
"acc,none": 0.3764367816091954,
|
19 |
+
"acc_stderr,none": 0.02600887296285643,
|
20 |
+
"acc_norm,none": 0.3764367816091954,
|
21 |
+
"acc_norm_stderr,none": 0.02600887296285643,
|
22 |
+
"alias": " - medmcqa_g2b"
|
23 |
+
},
|
24 |
+
"medmcqa_orig_filtered": {
|
25 |
+
"acc,none": 0.4224137931034483,
|
26 |
+
"acc_stderr,none": 0.02651628723013287,
|
27 |
+
"acc_norm,none": 0.4224137931034483,
|
28 |
+
"acc_norm_stderr,none": 0.02651628723013287,
|
29 |
+
"alias": " - medmcqa_orig_filtered"
|
30 |
+
},
|
31 |
+
"medqa_4options_g2b": {
|
32 |
+
"acc,none": 0.41798941798941797,
|
33 |
+
"acc_stderr,none": 0.02540255550326091,
|
34 |
+
"acc_norm,none": 0.41798941798941797,
|
35 |
+
"acc_norm_stderr,none": 0.02540255550326091,
|
36 |
+
"alias": " - medqa_4options_g2b"
|
37 |
+
},
|
38 |
+
"medqa_4options_orig_filtered": {
|
39 |
+
"acc,none": 0.43915343915343913,
|
40 |
+
"acc_stderr,none": 0.025559920550531003,
|
41 |
+
"acc_norm,none": 0.43915343915343913,
|
42 |
+
"acc_norm_stderr,none": 0.025559920550531003,
|
43 |
+
"alias": " - medqa_4options_orig_filtered"
|
44 |
+
}
|
45 |
+
},
|
46 |
+
"groups": {
|
47 |
+
"b4b": {
|
48 |
+
"acc,none": 0.44790382244143034,
|
49 |
+
"acc_stderr,none": 0.0343882858973779,
|
50 |
+
"acc_norm,none": 0.44790382244143034,
|
51 |
+
"acc_norm_stderr,none": 0.0343882858973779,
|
52 |
+
"alias": "b4b"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"configs": {
|
56 |
+
"b4bqa": {
|
57 |
+
"task": "b4bqa",
|
58 |
+
"dataset_path": "AIM-Harvard/b4b_drug_qa",
|
59 |
+
"test_split": "test",
|
60 |
+
"doc_to_text": "<function process_cd at 0x7fe111a60ee0>",
|
61 |
+
"doc_to_target": "correct_choice",
|
62 |
+
"doc_to_choice": [
|
63 |
+
"A",
|
64 |
+
"B",
|
65 |
+
"C",
|
66 |
+
"D"
|
67 |
+
],
|
68 |
+
"description": "",
|
69 |
+
"target_delimiter": " ",
|
70 |
+
"fewshot_delimiter": "\n\n",
|
71 |
+
"metric_list": [
|
72 |
+
{
|
73 |
+
"metric": "acc",
|
74 |
+
"aggregation": "mean",
|
75 |
+
"higher_is_better": true
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"metric": "acc_norm",
|
79 |
+
"aggregation": "mean",
|
80 |
+
"higher_is_better": true
|
81 |
+
}
|
82 |
+
],
|
83 |
+
"output_type": "multiple_choice",
|
84 |
+
"repeats": 1,
|
85 |
+
"should_decontaminate": false
|
86 |
+
},
|
87 |
+
"medmcqa_g2b": {
|
88 |
+
"task": "medmcqa_g2b",
|
89 |
+
"dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
|
90 |
+
"training_split": "train",
|
91 |
+
"validation_split": "validation",
|
92 |
+
"test_split": "validation",
|
93 |
+
"doc_to_text": "<function doc_to_text at 0x7fe111f9e3a0>",
|
94 |
+
"doc_to_target": "cop",
|
95 |
+
"doc_to_choice": [
|
96 |
+
"A",
|
97 |
+
"B",
|
98 |
+
"C",
|
99 |
+
"D"
|
100 |
+
],
|
101 |
+
"description": "",
|
102 |
+
"target_delimiter": " ",
|
103 |
+
"fewshot_delimiter": "\n\n",
|
104 |
+
"metric_list": [
|
105 |
+
{
|
106 |
+
"metric": "acc",
|
107 |
+
"aggregation": "mean",
|
108 |
+
"higher_is_better": true
|
109 |
+
},
|
110 |
+
{
|
111 |
+
"metric": "acc_norm",
|
112 |
+
"aggregation": "mean",
|
113 |
+
"higher_is_better": true
|
114 |
+
}
|
115 |
+
],
|
116 |
+
"output_type": "multiple_choice",
|
117 |
+
"repeats": 1,
|
118 |
+
"should_decontaminate": true,
|
119 |
+
"doc_to_decontamination_query": "{{question}}"
|
120 |
+
},
|
121 |
+
"medmcqa_orig_filtered": {
|
122 |
+
"task": "medmcqa_orig_filtered",
|
123 |
+
"dataset_path": "AIM-Harvard/medmcqa_original",
|
124 |
+
"training_split": "train",
|
125 |
+
"validation_split": "validation",
|
126 |
+
"test_split": "validation",
|
127 |
+
"doc_to_text": "<function doc_to_text at 0x7fe1118cb310>",
|
128 |
+
"doc_to_target": "cop",
|
129 |
+
"doc_to_choice": [
|
130 |
+
"A",
|
131 |
+
"B",
|
132 |
+
"C",
|
133 |
+
"D"
|
134 |
+
],
|
135 |
+
"description": "",
|
136 |
+
"target_delimiter": " ",
|
137 |
+
"fewshot_delimiter": "\n\n",
|
138 |
+
"metric_list": [
|
139 |
+
{
|
140 |
+
"metric": "acc",
|
141 |
+
"aggregation": "mean",
|
142 |
+
"higher_is_better": true
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"metric": "acc_norm",
|
146 |
+
"aggregation": "mean",
|
147 |
+
"higher_is_better": true
|
148 |
+
}
|
149 |
+
],
|
150 |
+
"output_type": "multiple_choice",
|
151 |
+
"repeats": 1,
|
152 |
+
"should_decontaminate": true,
|
153 |
+
"doc_to_decontamination_query": "{{question}}"
|
154 |
+
},
|
155 |
+
"medqa_4options_g2b": {
|
156 |
+
"task": "medqa_4options_g2b",
|
157 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
|
158 |
+
"training_split": "train",
|
159 |
+
"validation_split": "validation",
|
160 |
+
"test_split": "test",
|
161 |
+
"doc_to_text": "<function doc_to_text at 0x7fe111f9e820>",
|
162 |
+
"doc_to_target": "<function doc_to_target at 0x7fe111f9eb80>",
|
163 |
+
"doc_to_choice": [
|
164 |
+
"A",
|
165 |
+
"B",
|
166 |
+
"C",
|
167 |
+
"D"
|
168 |
+
],
|
169 |
+
"description": "",
|
170 |
+
"target_delimiter": " ",
|
171 |
+
"fewshot_delimiter": "\n\n",
|
172 |
+
"metric_list": [
|
173 |
+
{
|
174 |
+
"metric": "acc",
|
175 |
+
"aggregation": "mean",
|
176 |
+
"higher_is_better": true
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"metric": "acc_norm",
|
180 |
+
"aggregation": "mean",
|
181 |
+
"higher_is_better": true
|
182 |
+
}
|
183 |
+
],
|
184 |
+
"output_type": "multiple_choice",
|
185 |
+
"repeats": 1,
|
186 |
+
"should_decontaminate": false
|
187 |
+
},
|
188 |
+
"medqa_4options_orig_filtered": {
|
189 |
+
"task": "medqa_4options_orig_filtered",
|
190 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
|
191 |
+
"training_split": "train",
|
192 |
+
"validation_split": "validation",
|
193 |
+
"test_split": "test",
|
194 |
+
"doc_to_text": "<function doc_to_text at 0x7fe111a834c0>",
|
195 |
+
"doc_to_target": "<function doc_to_target at 0x7fe111a30ee0>",
|
196 |
+
"doc_to_choice": [
|
197 |
+
"A",
|
198 |
+
"B",
|
199 |
+
"C",
|
200 |
+
"D"
|
201 |
+
],
|
202 |
+
"description": "",
|
203 |
+
"target_delimiter": " ",
|
204 |
+
"fewshot_delimiter": "\n\n",
|
205 |
+
"metric_list": [
|
206 |
+
{
|
207 |
+
"metric": "acc",
|
208 |
+
"aggregation": "mean",
|
209 |
+
"higher_is_better": true
|
210 |
+
},
|
211 |
+
{
|
212 |
+
"metric": "acc_norm",
|
213 |
+
"aggregation": "mean",
|
214 |
+
"higher_is_better": true
|
215 |
+
}
|
216 |
+
],
|
217 |
+
"output_type": "multiple_choice",
|
218 |
+
"repeats": 1,
|
219 |
+
"should_decontaminate": false
|
220 |
+
}
|
221 |
+
},
|
222 |
+
"versions": {
|
223 |
+
"b4b": "N/A",
|
224 |
+
"b4bqa": "Yaml",
|
225 |
+
"medmcqa_g2b": "Yaml",
|
226 |
+
"medmcqa_orig_filtered": "Yaml",
|
227 |
+
"medqa_4options_g2b": "Yaml",
|
228 |
+
"medqa_4options_orig_filtered": "Yaml"
|
229 |
+
},
|
230 |
+
"n-shot": {
|
231 |
+
"b4b": 0,
|
232 |
+
"b4bqa": 0,
|
233 |
+
"medmcqa_g2b": 0,
|
234 |
+
"medmcqa_orig_filtered": 0,
|
235 |
+
"medqa_4options_g2b": 0,
|
236 |
+
"medqa_4options_orig_filtered": 0
|
237 |
+
},
|
238 |
+
"config": {
|
239 |
+
"model": "hf",
|
240 |
+
"model_args": "pretrained=microsoft/phi-2,load_in_4bit=True",
|
241 |
+
"batch_size": "auto:64",
|
242 |
+
"batch_sizes": [
|
243 |
+
32,
|
244 |
+
32,
|
245 |
+
64,
|
246 |
+
64,
|
247 |
+
64,
|
248 |
+
64,
|
249 |
+
64,
|
250 |
+
64,
|
251 |
+
64,
|
252 |
+
64,
|
253 |
+
64,
|
254 |
+
64,
|
255 |
+
64,
|
256 |
+
64,
|
257 |
+
64,
|
258 |
+
64,
|
259 |
+
64,
|
260 |
+
64,
|
261 |
+
64,
|
262 |
+
64,
|
263 |
+
64,
|
264 |
+
64,
|
265 |
+
64,
|
266 |
+
64,
|
267 |
+
64,
|
268 |
+
64,
|
269 |
+
64,
|
270 |
+
64,
|
271 |
+
64,
|
272 |
+
64,
|
273 |
+
64,
|
274 |
+
64,
|
275 |
+
64,
|
276 |
+
64,
|
277 |
+
64,
|
278 |
+
64,
|
279 |
+
64,
|
280 |
+
64,
|
281 |
+
64,
|
282 |
+
64,
|
283 |
+
64,
|
284 |
+
64,
|
285 |
+
64,
|
286 |
+
64,
|
287 |
+
64,
|
288 |
+
64,
|
289 |
+
64,
|
290 |
+
64,
|
291 |
+
64,
|
292 |
+
64,
|
293 |
+
64,
|
294 |
+
64,
|
295 |
+
64,
|
296 |
+
64,
|
297 |
+
64,
|
298 |
+
64,
|
299 |
+
64,
|
300 |
+
64,
|
301 |
+
64,
|
302 |
+
64,
|
303 |
+
64,
|
304 |
+
64,
|
305 |
+
64,
|
306 |
+
64,
|
307 |
+
64
|
308 |
+
],
|
309 |
+
"device": "cuda:0",
|
310 |
+
"use_cache": null,
|
311 |
+
"limit": null,
|
312 |
+
"bootstrap_iters": 100000,
|
313 |
+
"gen_kwargs": null
|
314 |
+
},
|
315 |
+
"git_hash": "928c7657"
|
316 |
+
}
|
data/raw-eval-outputs/mistralai-Mistral-7B-v0.3_results.json
ADDED
@@ -0,0 +1,316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"b4b": {
|
4 |
+
"acc,none": 0.6199136868064118,
|
5 |
+
"acc_stderr,none": 0.0837373393352743,
|
6 |
+
"acc_norm,none": 0.6199136868064118,
|
7 |
+
"acc_norm_stderr,none": 0.0837373393352743,
|
8 |
+
"alias": "b4b"
|
9 |
+
},
|
10 |
+
"b4bqa": {
|
11 |
+
"acc,none": 0.703125,
|
12 |
+
"acc_stderr,none": 0.010795811437682205,
|
13 |
+
"acc_norm,none": 0.703125,
|
14 |
+
"acc_norm_stderr,none": 0.010795811437682205,
|
15 |
+
"alias": " - b4bqa"
|
16 |
+
},
|
17 |
+
"medmcqa_g2b": {
|
18 |
+
"acc,none": 0.4827586206896552,
|
19 |
+
"acc_stderr,none": 0.026825443578224806,
|
20 |
+
"acc_norm,none": 0.4827586206896552,
|
21 |
+
"acc_norm_stderr,none": 0.026825443578224806,
|
22 |
+
"alias": " - medmcqa_g2b"
|
23 |
+
},
|
24 |
+
"medmcqa_orig_filtered": {
|
25 |
+
"acc,none": 0.5689655172413793,
|
26 |
+
"acc_stderr,none": 0.026584851780353615,
|
27 |
+
"acc_norm,none": 0.5689655172413793,
|
28 |
+
"acc_norm_stderr,none": 0.026584851780353615,
|
29 |
+
"alias": " - medmcqa_orig_filtered"
|
30 |
+
},
|
31 |
+
"medqa_4options_g2b": {
|
32 |
+
"acc,none": 0.48677248677248675,
|
33 |
+
"acc_stderr,none": 0.025742297289575142,
|
34 |
+
"acc_norm,none": 0.48677248677248675,
|
35 |
+
"acc_norm_stderr,none": 0.025742297289575142,
|
36 |
+
"alias": " - medqa_4options_g2b"
|
37 |
+
},
|
38 |
+
"medqa_4options_orig_filtered": {
|
39 |
+
"acc,none": 0.5317460317460317,
|
40 |
+
"acc_stderr,none": 0.0256993528321318,
|
41 |
+
"acc_norm,none": 0.5317460317460317,
|
42 |
+
"acc_norm_stderr,none": 0.0256993528321318,
|
43 |
+
"alias": " - medqa_4options_orig_filtered"
|
44 |
+
}
|
45 |
+
},
|
46 |
+
"groups": {
|
47 |
+
"b4b": {
|
48 |
+
"acc,none": 0.6199136868064118,
|
49 |
+
"acc_stderr,none": 0.0837373393352743,
|
50 |
+
"acc_norm,none": 0.6199136868064118,
|
51 |
+
"acc_norm_stderr,none": 0.0837373393352743,
|
52 |
+
"alias": "b4b"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"configs": {
|
56 |
+
"b4bqa": {
|
57 |
+
"task": "b4bqa",
|
58 |
+
"dataset_path": "AIM-Harvard/b4b_drug_qa",
|
59 |
+
"test_split": "test",
|
60 |
+
"doc_to_text": "<function process_cd at 0x7f6e18c5ff70>",
|
61 |
+
"doc_to_target": "correct_choice",
|
62 |
+
"doc_to_choice": [
|
63 |
+
"A",
|
64 |
+
"B",
|
65 |
+
"C",
|
66 |
+
"D"
|
67 |
+
],
|
68 |
+
"description": "",
|
69 |
+
"target_delimiter": " ",
|
70 |
+
"fewshot_delimiter": "\n\n",
|
71 |
+
"metric_list": [
|
72 |
+
{
|
73 |
+
"metric": "acc",
|
74 |
+
"aggregation": "mean",
|
75 |
+
"higher_is_better": true
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"metric": "acc_norm",
|
79 |
+
"aggregation": "mean",
|
80 |
+
"higher_is_better": true
|
81 |
+
}
|
82 |
+
],
|
83 |
+
"output_type": "multiple_choice",
|
84 |
+
"repeats": 1,
|
85 |
+
"should_decontaminate": false
|
86 |
+
},
|
87 |
+
"medmcqa_g2b": {
|
88 |
+
"task": "medmcqa_g2b",
|
89 |
+
"dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
|
90 |
+
"training_split": "train",
|
91 |
+
"validation_split": "validation",
|
92 |
+
"test_split": "validation",
|
93 |
+
"doc_to_text": "<function doc_to_text at 0x7f6e1919e430>",
|
94 |
+
"doc_to_target": "cop",
|
95 |
+
"doc_to_choice": [
|
96 |
+
"A",
|
97 |
+
"B",
|
98 |
+
"C",
|
99 |
+
"D"
|
100 |
+
],
|
101 |
+
"description": "",
|
102 |
+
"target_delimiter": " ",
|
103 |
+
"fewshot_delimiter": "\n\n",
|
104 |
+
"metric_list": [
|
105 |
+
{
|
106 |
+
"metric": "acc",
|
107 |
+
"aggregation": "mean",
|
108 |
+
"higher_is_better": true
|
109 |
+
},
|
110 |
+
{
|
111 |
+
"metric": "acc_norm",
|
112 |
+
"aggregation": "mean",
|
113 |
+
"higher_is_better": true
|
114 |
+
}
|
115 |
+
],
|
116 |
+
"output_type": "multiple_choice",
|
117 |
+
"repeats": 1,
|
118 |
+
"should_decontaminate": true,
|
119 |
+
"doc_to_decontamination_query": "{{question}}"
|
120 |
+
},
|
121 |
+
"medmcqa_orig_filtered": {
|
122 |
+
"task": "medmcqa_orig_filtered",
|
123 |
+
"dataset_path": "AIM-Harvard/medmcqa_original",
|
124 |
+
"training_split": "train",
|
125 |
+
"validation_split": "validation",
|
126 |
+
"test_split": "validation",
|
127 |
+
"doc_to_text": "<function doc_to_text at 0x7f6e18acb3a0>",
|
128 |
+
"doc_to_target": "cop",
|
129 |
+
"doc_to_choice": [
|
130 |
+
"A",
|
131 |
+
"B",
|
132 |
+
"C",
|
133 |
+
"D"
|
134 |
+
],
|
135 |
+
"description": "",
|
136 |
+
"target_delimiter": " ",
|
137 |
+
"fewshot_delimiter": "\n\n",
|
138 |
+
"metric_list": [
|
139 |
+
{
|
140 |
+
"metric": "acc",
|
141 |
+
"aggregation": "mean",
|
142 |
+
"higher_is_better": true
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"metric": "acc_norm",
|
146 |
+
"aggregation": "mean",
|
147 |
+
"higher_is_better": true
|
148 |
+
}
|
149 |
+
],
|
150 |
+
"output_type": "multiple_choice",
|
151 |
+
"repeats": 1,
|
152 |
+
"should_decontaminate": true,
|
153 |
+
"doc_to_decontamination_query": "{{question}}"
|
154 |
+
},
|
155 |
+
"medqa_4options_g2b": {
|
156 |
+
"task": "medqa_4options_g2b",
|
157 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
|
158 |
+
"training_split": "train",
|
159 |
+
"validation_split": "validation",
|
160 |
+
"test_split": "test",
|
161 |
+
"doc_to_text": "<function doc_to_text at 0x7f6e1919e8b0>",
|
162 |
+
"doc_to_target": "<function doc_to_target at 0x7f6e1919ec10>",
|
163 |
+
"doc_to_choice": [
|
164 |
+
"A",
|
165 |
+
"B",
|
166 |
+
"C",
|
167 |
+
"D"
|
168 |
+
],
|
169 |
+
"description": "",
|
170 |
+
"target_delimiter": " ",
|
171 |
+
"fewshot_delimiter": "\n\n",
|
172 |
+
"metric_list": [
|
173 |
+
{
|
174 |
+
"metric": "acc",
|
175 |
+
"aggregation": "mean",
|
176 |
+
"higher_is_better": true
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"metric": "acc_norm",
|
180 |
+
"aggregation": "mean",
|
181 |
+
"higher_is_better": true
|
182 |
+
}
|
183 |
+
],
|
184 |
+
"output_type": "multiple_choice",
|
185 |
+
"repeats": 1,
|
186 |
+
"should_decontaminate": false
|
187 |
+
},
|
188 |
+
"medqa_4options_orig_filtered": {
|
189 |
+
"task": "medqa_4options_orig_filtered",
|
190 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
|
191 |
+
"training_split": "train",
|
192 |
+
"validation_split": "validation",
|
193 |
+
"test_split": "test",
|
194 |
+
"doc_to_text": "<function doc_to_text at 0x7f6e18c80550>",
|
195 |
+
"doc_to_target": "<function doc_to_target at 0x7f6e18c2ff70>",
|
196 |
+
"doc_to_choice": [
|
197 |
+
"A",
|
198 |
+
"B",
|
199 |
+
"C",
|
200 |
+
"D"
|
201 |
+
],
|
202 |
+
"description": "",
|
203 |
+
"target_delimiter": " ",
|
204 |
+
"fewshot_delimiter": "\n\n",
|
205 |
+
"metric_list": [
|
206 |
+
{
|
207 |
+
"metric": "acc",
|
208 |
+
"aggregation": "mean",
|
209 |
+
"higher_is_better": true
|
210 |
+
},
|
211 |
+
{
|
212 |
+
"metric": "acc_norm",
|
213 |
+
"aggregation": "mean",
|
214 |
+
"higher_is_better": true
|
215 |
+
}
|
216 |
+
],
|
217 |
+
"output_type": "multiple_choice",
|
218 |
+
"repeats": 1,
|
219 |
+
"should_decontaminate": false
|
220 |
+
}
|
221 |
+
},
|
222 |
+
"versions": {
|
223 |
+
"b4b": "N/A",
|
224 |
+
"b4bqa": "Yaml",
|
225 |
+
"medmcqa_g2b": "Yaml",
|
226 |
+
"medmcqa_orig_filtered": "Yaml",
|
227 |
+
"medqa_4options_g2b": "Yaml",
|
228 |
+
"medqa_4options_orig_filtered": "Yaml"
|
229 |
+
},
|
230 |
+
"n-shot": {
|
231 |
+
"b4b": 0,
|
232 |
+
"b4bqa": 0,
|
233 |
+
"medmcqa_g2b": 0,
|
234 |
+
"medmcqa_orig_filtered": 0,
|
235 |
+
"medqa_4options_g2b": 0,
|
236 |
+
"medqa_4options_orig_filtered": 0
|
237 |
+
},
|
238 |
+
"config": {
|
239 |
+
"model": "hf",
|
240 |
+
"model_args": "pretrained=mistralai/Mistral-7B-v0.3,load_in_4bit=True",
|
241 |
+
"batch_size": "auto:64",
|
242 |
+
"batch_sizes": [
|
243 |
+
32,
|
244 |
+
64,
|
245 |
+
64,
|
246 |
+
64,
|
247 |
+
64,
|
248 |
+
64,
|
249 |
+
64,
|
250 |
+
64,
|
251 |
+
64,
|
252 |
+
64,
|
253 |
+
64,
|
254 |
+
64,
|
255 |
+
64,
|
256 |
+
64,
|
257 |
+
64,
|
258 |
+
64,
|
259 |
+
64,
|
260 |
+
64,
|
261 |
+
64,
|
262 |
+
64,
|
263 |
+
64,
|
264 |
+
64,
|
265 |
+
64,
|
266 |
+
64,
|
267 |
+
64,
|
268 |
+
64,
|
269 |
+
64,
|
270 |
+
64,
|
271 |
+
64,
|
272 |
+
64,
|
273 |
+
64,
|
274 |
+
64,
|
275 |
+
64,
|
276 |
+
64,
|
277 |
+
64,
|
278 |
+
64,
|
279 |
+
64,
|
280 |
+
64,
|
281 |
+
64,
|
282 |
+
64,
|
283 |
+
64,
|
284 |
+
64,
|
285 |
+
64,
|
286 |
+
64,
|
287 |
+
64,
|
288 |
+
64,
|
289 |
+
64,
|
290 |
+
64,
|
291 |
+
64,
|
292 |
+
64,
|
293 |
+
64,
|
294 |
+
64,
|
295 |
+
64,
|
296 |
+
64,
|
297 |
+
64,
|
298 |
+
64,
|
299 |
+
64,
|
300 |
+
64,
|
301 |
+
64,
|
302 |
+
64,
|
303 |
+
64,
|
304 |
+
64,
|
305 |
+
64,
|
306 |
+
64,
|
307 |
+
64
|
308 |
+
],
|
309 |
+
"device": "cuda:0",
|
310 |
+
"use_cache": null,
|
311 |
+
"limit": null,
|
312 |
+
"bootstrap_iters": 100000,
|
313 |
+
"gen_kwargs": null
|
314 |
+
},
|
315 |
+
"git_hash": "928c7657"
|
316 |
+
}
|
data/raw-eval-outputs/mistralai-Mixtral-8x22B-v0.1_results.json
ADDED
@@ -0,0 +1,252 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"b4b": {
|
4 |
+
"acc,none": 0.7882244143033293,
|
5 |
+
"acc_stderr,none": 0.08841138813945006,
|
6 |
+
"acc_norm,none": 0.7882244143033293,
|
7 |
+
"acc_norm_stderr,none": 0.08841138813945006,
|
8 |
+
"alias": "b4b"
|
9 |
+
},
|
10 |
+
"b4bqa": {
|
11 |
+
"acc,none": 0.8772321428571429,
|
12 |
+
"acc_stderr,none": 0.007754464516034243,
|
13 |
+
"acc_norm,none": 0.8772321428571429,
|
14 |
+
"acc_norm_stderr,none": 0.007754464516034243,
|
15 |
+
"alias": " - b4bqa"
|
16 |
+
},
|
17 |
+
"medmcqa_g2b": {
|
18 |
+
"acc,none": 0.617816091954023,
|
19 |
+
"acc_stderr,none": 0.026085614333362674,
|
20 |
+
"acc_norm,none": 0.617816091954023,
|
21 |
+
"acc_norm_stderr,none": 0.026085614333362674,
|
22 |
+
"alias": " - medmcqa_g2b"
|
23 |
+
},
|
24 |
+
"medmcqa_orig_filtered": {
|
25 |
+
"acc,none": 0.7040229885057471,
|
26 |
+
"acc_stderr,none": 0.024505167376090542,
|
27 |
+
"acc_norm,none": 0.7040229885057471,
|
28 |
+
"acc_norm_stderr,none": 0.024505167376090542,
|
29 |
+
"alias": " - medmcqa_orig_filtered"
|
30 |
+
},
|
31 |
+
"medqa_4options_g2b": {
|
32 |
+
"acc,none": 0.6746031746031746,
|
33 |
+
"acc_stderr,none": 0.024130158299762613,
|
34 |
+
"acc_norm,none": 0.6746031746031746,
|
35 |
+
"acc_norm_stderr,none": 0.024130158299762613,
|
36 |
+
"alias": " - medqa_4options_g2b"
|
37 |
+
},
|
38 |
+
"medqa_4options_orig_filtered": {
|
39 |
+
"acc,none": 0.7142857142857143,
|
40 |
+
"acc_stderr,none": 0.023266512213730585,
|
41 |
+
"acc_norm,none": 0.7142857142857143,
|
42 |
+
"acc_norm_stderr,none": 0.023266512213730585,
|
43 |
+
"alias": " - medqa_4options_orig_filtered"
|
44 |
+
}
|
45 |
+
},
|
46 |
+
"groups": {
|
47 |
+
"b4b": {
|
48 |
+
"acc,none": 0.7882244143033293,
|
49 |
+
"acc_stderr,none": 0.08841138813945006,
|
50 |
+
"acc_norm,none": 0.7882244143033293,
|
51 |
+
"acc_norm_stderr,none": 0.08841138813945006,
|
52 |
+
"alias": "b4b"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"configs": {
|
56 |
+
"b4bqa": {
|
57 |
+
"task": "b4bqa",
|
58 |
+
"dataset_path": "AIM-Harvard/b4b_drug_qa",
|
59 |
+
"test_split": "test",
|
60 |
+
"doc_to_text": "<function process_cd at 0x7ff2d0094820>",
|
61 |
+
"doc_to_target": "correct_choice",
|
62 |
+
"doc_to_choice": [
|
63 |
+
"A",
|
64 |
+
"B",
|
65 |
+
"C",
|
66 |
+
"D"
|
67 |
+
],
|
68 |
+
"description": "",
|
69 |
+
"target_delimiter": " ",
|
70 |
+
"fewshot_delimiter": "\n\n",
|
71 |
+
"metric_list": [
|
72 |
+
{
|
73 |
+
"metric": "acc",
|
74 |
+
"aggregation": "mean",
|
75 |
+
"higher_is_better": true
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"metric": "acc_norm",
|
79 |
+
"aggregation": "mean",
|
80 |
+
"higher_is_better": true
|
81 |
+
}
|
82 |
+
],
|
83 |
+
"output_type": "multiple_choice",
|
84 |
+
"repeats": 1,
|
85 |
+
"should_decontaminate": false
|
86 |
+
},
|
87 |
+
"medmcqa_g2b": {
|
88 |
+
"task": "medmcqa_g2b",
|
89 |
+
"dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
|
90 |
+
"training_split": "train",
|
91 |
+
"validation_split": "validation",
|
92 |
+
"test_split": "validation",
|
93 |
+
"doc_to_text": "<function doc_to_text at 0x7ff2d00de050>",
|
94 |
+
"doc_to_target": "cop",
|
95 |
+
"doc_to_choice": [
|
96 |
+
"A",
|
97 |
+
"B",
|
98 |
+
"C",
|
99 |
+
"D"
|
100 |
+
],
|
101 |
+
"description": "",
|
102 |
+
"target_delimiter": " ",
|
103 |
+
"fewshot_delimiter": "\n\n",
|
104 |
+
"metric_list": [
|
105 |
+
{
|
106 |
+
"metric": "acc",
|
107 |
+
"aggregation": "mean",
|
108 |
+
"higher_is_better": true
|
109 |
+
},
|
110 |
+
{
|
111 |
+
"metric": "acc_norm",
|
112 |
+
"aggregation": "mean",
|
113 |
+
"higher_is_better": true
|
114 |
+
}
|
115 |
+
],
|
116 |
+
"output_type": "multiple_choice",
|
117 |
+
"repeats": 1,
|
118 |
+
"should_decontaminate": true,
|
119 |
+
"doc_to_decontamination_query": "{{question}}"
|
120 |
+
},
|
121 |
+
"medmcqa_orig_filtered": {
|
122 |
+
"task": "medmcqa_orig_filtered",
|
123 |
+
"dataset_path": "AIM-Harvard/medmcqa_original",
|
124 |
+
"training_split": "train",
|
125 |
+
"validation_split": "validation",
|
126 |
+
"test_split": "validation",
|
127 |
+
"doc_to_text": "<function doc_to_text at 0x7ff2c2d29240>",
|
128 |
+
"doc_to_target": "cop",
|
129 |
+
"doc_to_choice": [
|
130 |
+
"A",
|
131 |
+
"B",
|
132 |
+
"C",
|
133 |
+
"D"
|
134 |
+
],
|
135 |
+
"description": "",
|
136 |
+
"target_delimiter": " ",
|
137 |
+
"fewshot_delimiter": "\n\n",
|
138 |
+
"metric_list": [
|
139 |
+
{
|
140 |
+
"metric": "acc",
|
141 |
+
"aggregation": "mean",
|
142 |
+
"higher_is_better": true
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"metric": "acc_norm",
|
146 |
+
"aggregation": "mean",
|
147 |
+
"higher_is_better": true
|
148 |
+
}
|
149 |
+
],
|
150 |
+
"output_type": "multiple_choice",
|
151 |
+
"repeats": 1,
|
152 |
+
"should_decontaminate": true,
|
153 |
+
"doc_to_decontamination_query": "{{question}}"
|
154 |
+
},
|
155 |
+
"medqa_4options_g2b": {
|
156 |
+
"task": "medqa_4options_g2b",
|
157 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
|
158 |
+
"training_split": "train",
|
159 |
+
"validation_split": "validation",
|
160 |
+
"test_split": "test",
|
161 |
+
"doc_to_text": "<function doc_to_text at 0x7ff2c4f34820>",
|
162 |
+
"doc_to_target": "<function doc_to_target at 0x7ff2c4f34b80>",
|
163 |
+
"doc_to_choice": [
|
164 |
+
"A",
|
165 |
+
"B",
|
166 |
+
"C",
|
167 |
+
"D"
|
168 |
+
],
|
169 |
+
"description": "",
|
170 |
+
"target_delimiter": " ",
|
171 |
+
"fewshot_delimiter": "\n\n",
|
172 |
+
"metric_list": [
|
173 |
+
{
|
174 |
+
"metric": "acc",
|
175 |
+
"aggregation": "mean",
|
176 |
+
"higher_is_better": true
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"metric": "acc_norm",
|
180 |
+
"aggregation": "mean",
|
181 |
+
"higher_is_better": true
|
182 |
+
}
|
183 |
+
],
|
184 |
+
"output_type": "multiple_choice",
|
185 |
+
"repeats": 1,
|
186 |
+
"should_decontaminate": false
|
187 |
+
},
|
188 |
+
"medqa_4options_orig_filtered": {
|
189 |
+
"task": "medqa_4options_orig_filtered",
|
190 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
|
191 |
+
"training_split": "train",
|
192 |
+
"validation_split": "validation",
|
193 |
+
"test_split": "test",
|
194 |
+
"doc_to_text": "<function doc_to_text at 0x7ff2d005a680>",
|
195 |
+
"doc_to_target": "<function doc_to_target at 0x7ff2d0079120>",
|
196 |
+
"doc_to_choice": [
|
197 |
+
"A",
|
198 |
+
"B",
|
199 |
+
"C",
|
200 |
+
"D"
|
201 |
+
],
|
202 |
+
"description": "",
|
203 |
+
"target_delimiter": " ",
|
204 |
+
"fewshot_delimiter": "\n\n",
|
205 |
+
"metric_list": [
|
206 |
+
{
|
207 |
+
"metric": "acc",
|
208 |
+
"aggregation": "mean",
|
209 |
+
"higher_is_better": true
|
210 |
+
},
|
211 |
+
{
|
212 |
+
"metric": "acc_norm",
|
213 |
+
"aggregation": "mean",
|
214 |
+
"higher_is_better": true
|
215 |
+
}
|
216 |
+
],
|
217 |
+
"output_type": "multiple_choice",
|
218 |
+
"repeats": 1,
|
219 |
+
"should_decontaminate": false
|
220 |
+
}
|
221 |
+
},
|
222 |
+
"versions": {
|
223 |
+
"b4b": "N/A",
|
224 |
+
"b4bqa": "Yaml",
|
225 |
+
"medmcqa_g2b": "Yaml",
|
226 |
+
"medmcqa_orig_filtered": "Yaml",
|
227 |
+
"medqa_4options_g2b": "Yaml",
|
228 |
+
"medqa_4options_orig_filtered": "Yaml"
|
229 |
+
},
|
230 |
+
"n-shot": {
|
231 |
+
"b4b": 0,
|
232 |
+
"b4bqa": 0,
|
233 |
+
"medmcqa_g2b": 0,
|
234 |
+
"medmcqa_orig_filtered": 0,
|
235 |
+
"medqa_4options_g2b": 0,
|
236 |
+
"medqa_4options_orig_filtered": 0
|
237 |
+
},
|
238 |
+
"config": {
|
239 |
+
"model": "hf",
|
240 |
+
"model_args": "pretrained=mistralai/Mixtral-8x22B-v0.1,parallelize=True,load_in_4bit=True",
|
241 |
+
"batch_size": "auto",
|
242 |
+
"batch_sizes": [
|
243 |
+
32
|
244 |
+
],
|
245 |
+
"device": null,
|
246 |
+
"use_cache": null,
|
247 |
+
"limit": null,
|
248 |
+
"bootstrap_iters": 100000,
|
249 |
+
"gen_kwargs": null
|
250 |
+
},
|
251 |
+
"git_hash": "928c7657"
|
252 |
+
}
|
data/raw-eval-outputs/mistralai-Mixtral-8x7B-v0.1_results.json
ADDED
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"b4b": {
|
4 |
+
"acc,none": 0.7475339087546239,
|
5 |
+
"acc_stderr,none": 0.11087824048509952,
|
6 |
+
"acc_norm,none": 0.7475339087546239,
|
7 |
+
"acc_norm_stderr,none": 0.11087824048509952,
|
8 |
+
"alias": "b4b"
|
9 |
+
},
|
10 |
+
"b4bqa": {
|
11 |
+
"acc,none": 0.8610491071428571,
|
12 |
+
"acc_stderr,none": 0.008173288677884256,
|
13 |
+
"acc_norm,none": 0.8610491071428571,
|
14 |
+
"acc_norm_stderr,none": 0.008173288677884256,
|
15 |
+
"alias": " - b4bqa"
|
16 |
+
},
|
17 |
+
"medmcqa_g2b": {
|
18 |
+
"acc,none": 0.5545977011494253,
|
19 |
+
"acc_stderr,none": 0.026680902895795475,
|
20 |
+
"acc_norm,none": 0.5545977011494253,
|
21 |
+
"acc_norm_stderr,none": 0.026680902895795475,
|
22 |
+
"alias": " - medmcqa_g2b"
|
23 |
+
},
|
24 |
+
"medmcqa_orig_filtered": {
|
25 |
+
"acc,none": 0.6494252873563219,
|
26 |
+
"acc_stderr,none": 0.025614751890362768,
|
27 |
+
"acc_norm,none": 0.6494252873563219,
|
28 |
+
"acc_norm_stderr,none": 0.025614751890362768,
|
29 |
+
"alias": " - medmcqa_orig_filtered"
|
30 |
+
},
|
31 |
+
"medqa_4options_g2b": {
|
32 |
+
"acc,none": 0.6005291005291006,
|
33 |
+
"acc_stderr,none": 0.025225450284067932,
|
34 |
+
"acc_norm,none": 0.6005291005291006,
|
35 |
+
"acc_norm_stderr,none": 0.025225450284067932,
|
36 |
+
"alias": " - medqa_4options_g2b"
|
37 |
+
},
|
38 |
+
"medqa_4options_orig_filtered": {
|
39 |
+
"acc,none": 0.6243386243386243,
|
40 |
+
"acc_stderr,none": 0.02494236893115979,
|
41 |
+
"acc_norm,none": 0.6243386243386243,
|
42 |
+
"acc_norm_stderr,none": 0.02494236893115979,
|
43 |
+
"alias": " - medqa_4options_orig_filtered"
|
44 |
+
}
|
45 |
+
},
|
46 |
+
"groups": {
|
47 |
+
"b4b": {
|
48 |
+
"acc,none": 0.7475339087546239,
|
49 |
+
"acc_stderr,none": 0.11087824048509952,
|
50 |
+
"acc_norm,none": 0.7475339087546239,
|
51 |
+
"acc_norm_stderr,none": 0.11087824048509952,
|
52 |
+
"alias": "b4b"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"configs": {
|
56 |
+
"b4bqa": {
|
57 |
+
"task": "b4bqa",
|
58 |
+
"dataset_path": "AIM-Harvard/b4b_drug_qa",
|
59 |
+
"test_split": "test",
|
60 |
+
"doc_to_text": "<function process_cd at 0x7fb0afadd090>",
|
61 |
+
"doc_to_target": "correct_choice",
|
62 |
+
"doc_to_choice": [
|
63 |
+
"A",
|
64 |
+
"B",
|
65 |
+
"C",
|
66 |
+
"D"
|
67 |
+
],
|
68 |
+
"description": "",
|
69 |
+
"target_delimiter": " ",
|
70 |
+
"fewshot_delimiter": "\n\n",
|
71 |
+
"metric_list": [
|
72 |
+
{
|
73 |
+
"metric": "acc",
|
74 |
+
"aggregation": "mean",
|
75 |
+
"higher_is_better": true
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"metric": "acc_norm",
|
79 |
+
"aggregation": "mean",
|
80 |
+
"higher_is_better": true
|
81 |
+
}
|
82 |
+
],
|
83 |
+
"output_type": "multiple_choice",
|
84 |
+
"repeats": 1,
|
85 |
+
"should_decontaminate": false
|
86 |
+
},
|
87 |
+
"medmcqa_g2b": {
|
88 |
+
"task": "medmcqa_g2b",
|
89 |
+
"dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
|
90 |
+
"training_split": "train",
|
91 |
+
"validation_split": "validation",
|
92 |
+
"test_split": "validation",
|
93 |
+
"doc_to_text": "<function doc_to_text at 0x7fb0adf3c550>",
|
94 |
+
"doc_to_target": "cop",
|
95 |
+
"doc_to_choice": [
|
96 |
+
"A",
|
97 |
+
"B",
|
98 |
+
"C",
|
99 |
+
"D"
|
100 |
+
],
|
101 |
+
"description": "",
|
102 |
+
"target_delimiter": " ",
|
103 |
+
"fewshot_delimiter": "\n\n",
|
104 |
+
"metric_list": [
|
105 |
+
{
|
106 |
+
"metric": "acc",
|
107 |
+
"aggregation": "mean",
|
108 |
+
"higher_is_better": true
|
109 |
+
},
|
110 |
+
{
|
111 |
+
"metric": "acc_norm",
|
112 |
+
"aggregation": "mean",
|
113 |
+
"higher_is_better": true
|
114 |
+
}
|
115 |
+
],
|
116 |
+
"output_type": "multiple_choice",
|
117 |
+
"repeats": 1,
|
118 |
+
"should_decontaminate": true,
|
119 |
+
"doc_to_decontamination_query": "{{question}}"
|
120 |
+
},
|
121 |
+
"medmcqa_orig_filtered": {
|
122 |
+
"task": "medmcqa_orig_filtered",
|
123 |
+
"dataset_path": "AIM-Harvard/medmcqa_original",
|
124 |
+
"training_split": "train",
|
125 |
+
"validation_split": "validation",
|
126 |
+
"test_split": "validation",
|
127 |
+
"doc_to_text": "<function doc_to_text at 0x7fb0adf3c700>",
|
128 |
+
"doc_to_target": "cop",
|
129 |
+
"doc_to_choice": [
|
130 |
+
"A",
|
131 |
+
"B",
|
132 |
+
"C",
|
133 |
+
"D"
|
134 |
+
],
|
135 |
+
"description": "",
|
136 |
+
"target_delimiter": " ",
|
137 |
+
"fewshot_delimiter": "\n\n",
|
138 |
+
"metric_list": [
|
139 |
+
{
|
140 |
+
"metric": "acc",
|
141 |
+
"aggregation": "mean",
|
142 |
+
"higher_is_better": true
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"metric": "acc_norm",
|
146 |
+
"aggregation": "mean",
|
147 |
+
"higher_is_better": true
|
148 |
+
}
|
149 |
+
],
|
150 |
+
"output_type": "multiple_choice",
|
151 |
+
"repeats": 1,
|
152 |
+
"should_decontaminate": true,
|
153 |
+
"doc_to_decontamination_query": "{{question}}"
|
154 |
+
},
|
155 |
+
"medqa_4options_g2b": {
|
156 |
+
"task": "medqa_4options_g2b",
|
157 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
|
158 |
+
"training_split": "train",
|
159 |
+
"validation_split": "validation",
|
160 |
+
"test_split": "test",
|
161 |
+
"doc_to_text": "<function doc_to_text at 0x7fb0adf3f6d0>",
|
162 |
+
"doc_to_target": "<function doc_to_target at 0x7fb0adf3fa30>",
|
163 |
+
"doc_to_choice": [
|
164 |
+
"A",
|
165 |
+
"B",
|
166 |
+
"C",
|
167 |
+
"D"
|
168 |
+
],
|
169 |
+
"description": "",
|
170 |
+
"target_delimiter": " ",
|
171 |
+
"fewshot_delimiter": "\n\n",
|
172 |
+
"metric_list": [
|
173 |
+
{
|
174 |
+
"metric": "acc",
|
175 |
+
"aggregation": "mean",
|
176 |
+
"higher_is_better": true
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"metric": "acc_norm",
|
180 |
+
"aggregation": "mean",
|
181 |
+
"higher_is_better": true
|
182 |
+
}
|
183 |
+
],
|
184 |
+
"output_type": "multiple_choice",
|
185 |
+
"repeats": 1,
|
186 |
+
"should_decontaminate": false
|
187 |
+
},
|
188 |
+
"medqa_4options_orig_filtered": {
|
189 |
+
"task": "medqa_4options_orig_filtered",
|
190 |
+
"dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
|
191 |
+
"training_split": "train",
|
192 |
+
"validation_split": "validation",
|
193 |
+
"test_split": "test",
|
194 |
+
"doc_to_text": "<function doc_to_text at 0x7fb0adf3fc70>",
|
195 |
+
"doc_to_target": "<function doc_to_target at 0x7fb0adf3feb0>",
|
196 |
+
"doc_to_choice": [
|
197 |
+
"A",
|
198 |
+
"B",
|
199 |
+
"C",
|
200 |
+
"D"
|
201 |
+
],
|
202 |
+
"description": "",
|
203 |
+
"target_delimiter": " ",
|
204 |
+
"fewshot_delimiter": "\n\n",
|
205 |
+
"metric_list": [
|
206 |
+
{
|
207 |
+
"metric": "acc",
|
208 |
+
"aggregation": "mean",
|
209 |
+
"higher_is_better": true
|
210 |
+
},
|
211 |
+
{
|
212 |
+
"metric": "acc_norm",
|
213 |
+
"aggregation": "mean",
|
214 |
+
"higher_is_better": true
|
215 |
+
}
|
216 |
+
],
|
217 |
+
"output_type": "multiple_choice",
|
218 |
+
"repeats": 1,
|
219 |
+
"should_decontaminate": false
|
220 |
+
}
|
221 |
+
},
|
222 |
+
"versions": {
|
223 |
+
"b4b": "N/A",
|
224 |
+
"b4bqa": "Yaml",
|
225 |
+
"medmcqa_g2b": "Yaml",
|
226 |
+
"medmcqa_orig_filtered": "Yaml",
|
227 |
+
"medqa_4options_g2b": "Yaml",
|
228 |
+
"medqa_4options_orig_filtered": "Yaml"
|
229 |
+
},
|
230 |
+
"n-shot": {
|
231 |
+
"b4b": 0,
|
232 |
+
"b4bqa": 0,
|
233 |
+
"medmcqa_g2b": 0,
|
234 |
+
"medmcqa_orig_filtered": 0,
|
235 |
+
"medqa_4options_g2b": 0,
|
236 |
+
"medqa_4options_orig_filtered": 0
|
237 |
+
},
|
238 |
+
"config": {
|
239 |
+
"model": "hf",
|
240 |
+
"model_args": "pretrained=mistralai/Mixtral-8x7B-v0.1,load_in_4bit=True",
|
241 |
+
"batch_size": "4",
|
242 |
+
"batch_sizes": [],
|
243 |
+
"device": "cuda:0",
|
244 |
+
"use_cache": null,
|
245 |
+
"limit": null,
|
246 |
+
"bootstrap_iters": 100000,
|
247 |
+
"gen_kwargs": null
|
248 |
+
},
|
249 |
+
"git_hash": "928c7657"
|
250 |
+
}
|
src/__pycache__/model_links.cpython-311.pyc
ADDED
Binary file (1.97 kB). View file
|
|
src/__pycache__/models_info.cpython-311.pyc
ADDED
Binary file (2.43 kB). View file
|
|
src/json2df.py
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
import pandas as pd
|
4 |
+
from models_info import model_info
|
5 |
+
|
6 |
+
# Folder scanned for per-model evaluation dumps (`*.json`). The path is
# relative, so it is resolved against the current working directory.
directory = 'data/raw-eval-outputs'
|
7 |
+
# Accumulates one row dict per results file; converted to a DataFrame further down.
data = []
|
8 |
+
|
9 |
+
def model_hyperlink(link, model_name):
    """Return an HTML anchor that opens ``link`` in a new tab, labelled ``model_name``.

    Both values are HTML-escaped so that characters such as ``&``, ``<`` or
    ``"`` in a URL or model name cannot break the generated markup. For values
    without such characters the output is identical to plain interpolation.

    Args:
        link: Target URL placed in the ``href`` attribute.
        model_name: Visible link text.

    Returns:
        The anchor element as a string.
    """
    # Function-scope import keeps this fix self-contained (no change to the
    # file's import block is required).
    from html import escape

    href = escape(str(link), quote=True)          # attribute value: quotes must be escaped too
    label = escape(str(model_name), quote=False)  # text node: only &, < and > matter
    return f'<a target="_blank" href="{href}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{label}</a>'
|
11 |
+
|
12 |
+
def make_clickable_names(df):
    """Rewrite the "Model" column of ``df`` as HTML links, in place.

    Every row's "Model" text is wrapped by ``model_hyperlink`` using the
    "Link" value from the same row.

    Args:
        df: DataFrame providing "Model" and "Link" columns.

    Returns:
        The same DataFrame object, with its "Model" column replaced.
    """

    def _linked_name(record):
        # `record` is one row of the frame (apply with axis=1).
        return model_hyperlink(record["Link"], record["Model"])

    df["Model"] = df.apply(_linked_name, axis=1)
    return df
|
17 |
+
|
18 |
+
# Build one row per JSON results file found in `directory`.
# sorted(): os.listdir() returns names in arbitrary order, which would make the
# row order of the generated CSV vary between machines and runs.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".json"):
        continue

    filepath = os.path.join(directory, filename)
    # Explicit encoding so parsing does not depend on the platform default.
    with open(filepath, 'r', encoding='utf-8') as f:
        json_data = json.load(f)
    model_name = filename.replace("_results.json", "")

    # One column per entry of "results", holding its "acc,none" value as a
    # percentage rounded to two decimals.
    results = json_data['results']
    row = {'Model': model_name}
    for key, value in results.items():
        row[key] = round(value['acc,none'] * 100, 2)

    # Tuning marker and model link are looked up by model name in model_info;
    # a name missing from that mapping raises KeyError.
    info = model_info[model_name]
    row['T'] = info['tuning']
    row['Link'] = info['link']

    data.append(row)
|
37 |
+
|
38 |
+
|
39 |
+
# Assemble the table, turn model names into links, then discard the helper
# "Link" column, which is only needed to build those links.
df = make_clickable_names(pd.DataFrame(data))
df.drop(columns=["Link"], inplace=True)

# Score difference per benchmark pair: "g2b" column minus "orig_filtered" column.
for diff_col, g2b_col, orig_col in (
    ("medmcqa_diff", "medmcqa_g2b", "medmcqa_orig_filtered"),
    ("medqa_diff", "medqa_4options_g2b", "medqa_4options_orig_filtered"),
):
    df[diff_col] = (df[g2b_col] - df[orig_col]).round(2)

# Fixed leading columns first; any other column keeps its existing relative order.
leading_cols = [
    "T",
    "Model",
    "b4bqa",
    "b4b",
    "medmcqa_g2b",
    "medmcqa_orig_filtered",
    "medmcqa_diff",
    "medqa_4options_g2b",
    "medqa_4options_orig_filtered",
    "medqa_diff",
]
cols = leading_cols + [col for col in df.columns if col not in leading_cols]
df = df[cols]

# Write the table without the index column and report where it went.
output_csv = 'data/csv/models_data.csv'
df.to_csv(output_csv, index=False)

print(f"DataFrame saved to {output_csv}")
|
src/models_info.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# NOTE: the tuning categories below are a first pass -- corrections welcome.
# TODO: add model size to each entry.

# Tuning-type markers shown in the leaderboard's "T" column. They are written
# as \U escapes so the values survive editors/tools that mangle non-ASCII text.
_PRETRAINED = "\U0001F7E2"  # LARGE GREEN CIRCLE: pre-trained
_FINE_TUNED = "\U0001F536"  # LARGE ORANGE DIAMOND: fine-tuned on task specific dataset
_CHAT = "\U0001F4AC"  # SPEECH BALLOON: chat model (RLHF, DPO, IFT, etc.)

# Maps a model identifier (the "<org>-<model>" prefix of its
# "<identifier>_results.json" file) to its Hugging Face link and tuning marker.
model_info = {
    "meta-llama-Meta-Llama-3-70B": {
        "link": "https://huggingface.co/meta-llama/Meta-Llama-3-70B",
        "tuning": _PRETRAINED,
    },
    "meta-llama-Meta-Llama-3-8B": {
        "link": "https://huggingface.co/meta-llama/Meta-Llama-3-8B",
        "tuning": _PRETRAINED,
    },
    "01-ai-Yi-1.5-34B": {
        "link": "https://huggingface.co/01-ai/Yi-1.5-34B",
        "tuning": _FINE_TUNED,
    },
    "aaditya-Llama3-OpenBioLLM-70B": {
        "link": "https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B",
        "tuning": _FINE_TUNED,
    },
    "CohereForAI-aya-23-35B": {
        "link": "https://huggingface.co/CohereForAI/aya-23-35B",
        "tuning": _FINE_TUNED,
    },
    "CohereForAI-c4ai-command-r-plus": {
        "link": "https://huggingface.co/CohereForAI/c4ai-command-r-plus",
        "tuning": _CHAT,
    },
    "johnsnowlabs-JSL-MedLlama-3-8B-v9": {
        "link": "https://huggingface.co/johnsnowlabs/JSL-MedLlama-3-8B-v9",
        "tuning": _FINE_TUNED,
    },
    "meta-llama-Llama-2-70B-hf": {
        "link": "https://huggingface.co/meta-llama/Llama-2-70B-hf",
        "tuning": _PRETRAINED,
    },
    "meta-llama-Llama-2-7b-hf": {
        "link": "https://huggingface.co/meta-llama/Llama-2-7b-hf",
        "tuning": _PRETRAINED,
    },
    "microsoft-phi-1_5": {
        "link": "https://huggingface.co/microsoft/phi-1_5",
        "tuning": _PRETRAINED,
    },
    "microsoft-phi-1": {
        "link": "https://huggingface.co/microsoft/phi-1",
        "tuning": _PRETRAINED,
    },
    "microsoft-phi-2": {
        "link": "https://huggingface.co/microsoft/phi-2",
        "tuning": _PRETRAINED,
    },
    "microsoft-Phi-3-medium-4k-instruct": {
        "link": "https://huggingface.co/microsoft/Phi-3-medium-4k-instruct",
        "tuning": _CHAT,
    },
    # NOTE(review): the three Mistral/Mixtral entries were annotated
    # "Continuously pre-trained" but carry the same marker as the pre-trained
    # models; the marker is kept as-is -- confirm the intended category.
    "mistralai-Mistral-7B-v0.3": {
        "link": "https://huggingface.co/mistralai/Mistral-7B-v0.3",
        "tuning": _PRETRAINED,
    },
    "mistralai-Mixtral-8x22B-v0.1": {
        "link": "https://huggingface.co/mistralai/Mixtral-8x22B-v0.1",
        "tuning": _PRETRAINED,
    },
    "mistralai-Mixtral-8x7B-v0.1": {
        "link": "https://huggingface.co/mistralai/Mixtral-8x7B-v0.1",
        "tuning": _PRETRAINED,
    },
    "ProbeMedicalYonseiMAILab-medllama3-v20": {
        "link": "https://huggingface.co/ProbeMedicalYonseiMAILab/medllama3-v20",
        "tuning": _FINE_TUNED,
    },
    "Qwen-Qwen2-72B": {
        "link": "https://huggingface.co/Qwen/Qwen2-72B",
        "tuning": _PRETRAINED,
    },
    "Qwen-Qwen2-7B": {
        "link": "https://huggingface.co/Qwen/Qwen2-7B",
        "tuning": _PRETRAINED,
    },
}
|