Bram Vanroy
commited on
Commit
•
863e074
1
Parent(s):
095087c
update with only Dutch
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- app.py +54 -60
- css.py +2 -2
- evals/arc/arc_ca-bloom-7b1.json +0 -23
- evals/arc/arc_ca-llama-7B.json +0 -23
- evals/arc/arc_da-bloom-7b1.json +0 -23
- evals/arc/arc_da-llama-7B.json +0 -23
- evals/arc/arc_de-bloom-7b1.json +0 -23
- evals/arc/arc_de-llama-7B.json +0 -23
- evals/arc/arc_es-bloom-7b1.json +0 -23
- evals/arc/arc_es-llama-7B.json +0 -23
- evals/arc/arc_eu-bloom-7b1.json +0 -23
- evals/arc/arc_eu-llama-7B.json +0 -23
- evals/arc/arc_fr-bloom-7b1.json +0 -23
- evals/arc/arc_fr-llama-7B.json +0 -23
- evals/arc/arc_gu-bloom-7b1.json +0 -23
- evals/arc/arc_gu-llama-7B.json +0 -23
- evals/arc/arc_hi-bloom-7b1.json +0 -23
- evals/arc/arc_hi-llama-7B.json +0 -23
- evals/arc/arc_hr-bloom-7b1.json +0 -23
- evals/arc/arc_hr-llama-7B.json +0 -23
- evals/arc/arc_hu-bloom-7b1.json +0 -23
- evals/arc/arc_hu-llama-7B.json +0 -23
- evals/arc/arc_hy-bloom-7b1.json +0 -23
- evals/arc/arc_hy-llama-7B.json +0 -23
- evals/arc/arc_id-bloom-7b1.json +0 -23
- evals/arc/arc_id-llama-7B.json +0 -23
- evals/arc/arc_it-bloom-7b1.json +0 -23
- evals/arc/arc_it-llama-7B.json +0 -23
- evals/arc/arc_kn-bloom-7b1.json +0 -23
- evals/arc/arc_kn-llama-7B.json +0 -23
- evals/arc/arc_ml-bloom-7b1.json +0 -23
- evals/arc/arc_ml-llama-7B.json +0 -23
- evals/arc/arc_mr-bloom-7b1.json +0 -23
- evals/arc/arc_mr-llama-7B.json +0 -23
- evals/arc/arc_ne-bloom-7b1.json +0 -23
- evals/arc/arc_ne-llama-7B.json +0 -23
- evals/arc/arc_nl_Llama-2-7b-chat-hf.json +23 -0
- evals/{mmlu/mmlu_gu-bloom-7b1.json → arc/arc_nl_Llama-2-7b-hf.json} +8 -8
- evals/arc/{arc_ar-llama-7B.json → arc_nl_Mistral-7B-v0.1.json} +8 -8
- evals/arc/{arc_bn-bloom-7b1.json → arc_nl_zephyr-7b-beta.json} +8 -8
- evals/arc/arc_pt-bloom-7b1.json +0 -23
- evals/arc/arc_pt-llama-7B.json +0 -23
- evals/arc/arc_ro-bloom-7b1.json +0 -23
- evals/arc/arc_ro-llama-7B.json +0 -23
- evals/arc/arc_ru-bloom-7b1.json +0 -23
- evals/arc/arc_ru-llama-7B.json +0 -23
- evals/arc/arc_sk-bloom-7b1.json +0 -23
- evals/arc/arc_sk-llama-7B.json +0 -23
- evals/arc/arc_sr-bloom-7b1.json +0 -23
- evals/arc/arc_sr-llama-7B.json +0 -23
app.py
CHANGED
@@ -1,9 +1,10 @@
|
|
1 |
-
import os
|
2 |
import json
|
3 |
-
import glob
|
4 |
from collections import defaultdict
|
|
|
|
|
5 |
import pandas as pd
|
6 |
import gradio as gr
|
|
|
7 |
from content import *
|
8 |
from css import *
|
9 |
import glob
|
@@ -16,74 +17,74 @@ BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA]
|
|
16 |
|
17 |
METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
|
18 |
|
19 |
-
LANGS =
|
20 |
|
21 |
LANG_NAME = {
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
}
|
54 |
|
55 |
|
56 |
def collect_results():
|
57 |
performance_dict = defaultdict(dict)
|
58 |
pretrained_models = set()
|
59 |
-
for
|
60 |
-
|
61 |
-
|
62 |
-
if 'results' not in data:
|
63 |
continue
|
64 |
-
if
|
65 |
continue
|
66 |
-
results = data[
|
67 |
-
config = data[
|
68 |
-
if
|
69 |
continue
|
70 |
|
71 |
-
model_args = config[
|
72 |
-
pretrained = [x for x in model_args if x.startswith(
|
73 |
if len(pretrained) != 1:
|
74 |
continue
|
75 |
-
pretrained = pretrained[0].split(
|
76 |
-
pretrained = pretrained.split(
|
77 |
pretrained_models.add(pretrained)
|
78 |
|
79 |
for lang_task, perfs in results.items():
|
80 |
-
task, lang = lang_task.split(
|
81 |
assert task in BENCHMARKS
|
82 |
|
83 |
if lang and task:
|
84 |
metric = METRICS[BENCHMARKS.index(task)]
|
85 |
p = round(perfs[metric] * 100, 1)
|
86 |
performance_dict[(pretrained, lang)][task] = p
|
|
|
87 |
return performance_dict, pretrained_models
|
88 |
|
89 |
|
@@ -96,15 +97,13 @@ def get_leaderboard_df(performance_dict, pretrained_models):
|
|
96 |
mmlu_perf = perfs.get(MMLU, 0.0)
|
97 |
truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
|
98 |
|
99 |
-
if arc_perf * hellaswag_perf * mmlu_perf * truthfulqa_perf == 0:
|
100 |
-
continue
|
101 |
avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
|
102 |
-
notes =
|
103 |
-
row = [pretrained,
|
104 |
df.append(row)
|
105 |
|
106 |
df = pd.DataFrame.from_records(df, columns=COLS)
|
107 |
-
df = df.sort_values(by=[
|
108 |
df = df[COLS]
|
109 |
|
110 |
return df
|
@@ -115,10 +114,7 @@ def search_table(df, query):
|
|
115 |
return filtered_df
|
116 |
|
117 |
|
118 |
-
|
119 |
MODEL_COL = "Model"
|
120 |
-
LANG_COL = "Language"
|
121 |
-
CODE_COL = "Code"
|
122 |
AVERAGE_COL = "Average"
|
123 |
ARC_COL = "ARC (25-shot)"
|
124 |
HELLASWAG_COL = "HellaSwag (10-shot)️"
|
@@ -126,8 +122,8 @@ MMLU_COL = "MMLU (5-shot)"
|
|
126 |
TRUTHFULQA_COL = "TruthfulQA (0-shot)"
|
127 |
NOTES_COL = "Notes" # For search only
|
128 |
|
129 |
-
COLS = [MODEL_COL,
|
130 |
-
TYPES = ["str", "
|
131 |
|
132 |
args = collect_results()
|
133 |
original_df = get_leaderboard_df(*args)
|
@@ -139,9 +135,7 @@ with demo:
|
|
139 |
gr.Markdown(HOW_TO, elem_classes="markdown-text")
|
140 |
|
141 |
with gr.Box():
|
142 |
-
search_bar = gr.Textbox(
|
143 |
-
placeholder="Search models and languages...", show_label=False, elem_id="search-bar"
|
144 |
-
)
|
145 |
|
146 |
leaderboard_table = gr.components.Dataframe(
|
147 |
value=original_df,
|
|
|
|
|
1 |
import json
|
|
|
2 |
from collections import defaultdict
|
3 |
+
from pathlib import Path
|
4 |
+
|
5 |
import pandas as pd
|
6 |
import gradio as gr
|
7 |
+
|
8 |
from content import *
|
9 |
from css import *
|
10 |
import glob
|
|
|
17 |
|
18 |
METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
|
19 |
|
20 |
+
LANGS = "ar,bn,ca,da,de,es,eu,fr,gu,hi,hr,hu,hy,id,it,kn,ml,mr,ne,nl,pt,ro,ru,sk,sr,sv,ta,te,uk,vi,zh".split(",")
|
21 |
|
22 |
LANG_NAME = {
|
23 |
+
"ar": "Arabic",
|
24 |
+
"bn": "Bengali",
|
25 |
+
"ca": "Catalan",
|
26 |
+
"da": "Danish",
|
27 |
+
"de": "German",
|
28 |
+
"es": "Spanish",
|
29 |
+
"eu": "Basque",
|
30 |
+
"fr": "French",
|
31 |
+
"gu": "Gujarati",
|
32 |
+
"hi": "Hindi",
|
33 |
+
"hr": "Croatian",
|
34 |
+
"hu": "Hungarian",
|
35 |
+
"hy": "Armenian",
|
36 |
+
"id": "Indonesian",
|
37 |
+
"it": "Italian",
|
38 |
+
"kn": "Kannada",
|
39 |
+
"ml": "Malayalam",
|
40 |
+
"mr": "Marathi",
|
41 |
+
"ne": "Nepali",
|
42 |
+
"nl": "Dutch",
|
43 |
+
"pt": "Portuguese",
|
44 |
+
"ro": "Romanian",
|
45 |
+
"ru": "Russian",
|
46 |
+
"sk": "Slovak",
|
47 |
+
"sr": "Serbian",
|
48 |
+
"sv": "Swedish",
|
49 |
+
"ta": "Tamil",
|
50 |
+
"te": "Telugu",
|
51 |
+
"uk": "Ukrainian",
|
52 |
+
"vi": "Vietnamese",
|
53 |
+
"zh": "Chinese",
|
54 |
}
|
55 |
|
56 |
|
57 |
def collect_results():
|
58 |
performance_dict = defaultdict(dict)
|
59 |
pretrained_models = set()
|
60 |
+
for pfin in Path("evals").rglob("*.json"):
|
61 |
+
data = json.loads(pfin.read_text(encoding="utf-8"))
|
62 |
+
if "results" not in data:
|
|
|
63 |
continue
|
64 |
+
if "config" not in data:
|
65 |
continue
|
66 |
+
results = data["results"]
|
67 |
+
config = data["config"]
|
68 |
+
if "model_args" not in config:
|
69 |
continue
|
70 |
|
71 |
+
model_args = config["model_args"].split(",")
|
72 |
+
pretrained = [x for x in model_args if x.startswith("pretrained=")]
|
73 |
if len(pretrained) != 1:
|
74 |
continue
|
75 |
+
pretrained = pretrained[0].split("=")[1]
|
76 |
+
pretrained = pretrained.split("/")[-1]
|
77 |
pretrained_models.add(pretrained)
|
78 |
|
79 |
for lang_task, perfs in results.items():
|
80 |
+
task, lang = lang_task.split("_")
|
81 |
assert task in BENCHMARKS
|
82 |
|
83 |
if lang and task:
|
84 |
metric = METRICS[BENCHMARKS.index(task)]
|
85 |
p = round(perfs[metric] * 100, 1)
|
86 |
performance_dict[(pretrained, lang)][task] = p
|
87 |
+
|
88 |
return performance_dict, pretrained_models
|
89 |
|
90 |
|
|
|
97 |
mmlu_perf = perfs.get(MMLU, 0.0)
|
98 |
truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
|
99 |
|
|
|
|
|
100 |
avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
|
101 |
+
notes = " ".join([pretrained, lang_name])
|
102 |
+
row = [pretrained, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf, notes]
|
103 |
df.append(row)
|
104 |
|
105 |
df = pd.DataFrame.from_records(df, columns=COLS)
|
106 |
+
df = df.sort_values(by=[AVERAGE_COL], ascending=False)
|
107 |
df = df[COLS]
|
108 |
|
109 |
return df
|
|
|
114 |
return filtered_df
|
115 |
|
116 |
|
|
|
117 |
MODEL_COL = "Model"
|
|
|
|
|
118 |
AVERAGE_COL = "Average"
|
119 |
ARC_COL = "ARC (25-shot)"
|
120 |
HELLASWAG_COL = "HellaSwag (10-shot)️"
|
|
|
122 |
TRUTHFULQA_COL = "TruthfulQA (0-shot)"
|
123 |
NOTES_COL = "Notes" # For search only
|
124 |
|
125 |
+
COLS = [MODEL_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL, NOTES_COL]
|
126 |
+
TYPES = ["str", "number", "number", "number", "number", "number", "str"]
|
127 |
|
128 |
args = collect_results()
|
129 |
original_df = get_leaderboard_df(*args)
|
|
|
135 |
gr.Markdown(HOW_TO, elem_classes="markdown-text")
|
136 |
|
137 |
with gr.Box():
|
138 |
+
search_bar = gr.Textbox(placeholder="Search models and languages...", show_label=False, elem_id="search-bar")
|
|
|
|
|
139 |
|
140 |
leaderboard_table = gr.components.Dataframe(
|
141 |
value=original_df,
|
css.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
CUSTOM_CSS= """
|
2 |
/* Hides the final column */
|
3 |
table td:last-child,
|
4 |
table th:last-child {
|
@@ -10,4 +10,4 @@ table th:last-child {
|
|
10 |
# overflow: auto;
|
11 |
# white-space: nowrap;
|
12 |
# }
|
13 |
-
"""
|
|
|
1 |
+
CUSTOM_CSS = """
|
2 |
/* Hides the final column */
|
3 |
table td:last-child,
|
4 |
table th:last-child {
|
|
|
10 |
# overflow: auto;
|
11 |
# white-space: nowrap;
|
12 |
# }
|
13 |
+
"""
|
evals/arc/arc_ca-bloom-7b1.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_ca": {
|
4 |
-
"acc": 0.31989708404802747,
|
5 |
-
"acc_stderr": 0.01366562491926326,
|
6 |
-
"acc_norm": 0.34734133790737565,
|
7 |
-
"acc_norm_stderr": 0.013949489903701517
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_ca": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=bigscience/bloom-7b1",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_ca-llama-7B.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_ca": {
|
4 |
-
"acc": 0.3276157804459691,
|
5 |
-
"acc_stderr": 0.01375080741597368,
|
6 |
-
"acc_norm": 0.3507718696397942,
|
7 |
-
"acc_norm_stderr": 0.013981316936172217
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_ca": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_da-bloom-7b1.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_da": {
|
4 |
-
"acc": 0.20137103684661525,
|
5 |
-
"acc_stderr": 0.011744154502532795,
|
6 |
-
"acc_norm": 0.24592973436161097,
|
7 |
-
"acc_norm_stderr": 0.012611366681285752
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_da": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=bigscience/bloom-7b1",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_da-llama-7B.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_da": {
|
4 |
-
"acc": 0.286203941730934,
|
5 |
-
"acc_stderr": 0.013236574332463879,
|
6 |
-
"acc_norm": 0.3273350471293916,
|
7 |
-
"acc_norm_stderr": 0.013741887176251822
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_da": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_de-bloom-7b1.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_de": {
|
4 |
-
"acc": 0.22241231822070145,
|
5 |
-
"acc_stderr": 0.012168377742629776,
|
6 |
-
"acc_norm": 0.262617621899059,
|
7 |
-
"acc_norm_stderr": 0.01287617552045283
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_de": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=bigscience/bloom-7b1",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_de-llama-7B.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_de": {
|
4 |
-
"acc": 0.2951240376390077,
|
5 |
-
"acc_stderr": 0.013345572865502645,
|
6 |
-
"acc_norm": 0.35072711719418304,
|
7 |
-
"acc_norm_stderr": 0.013962940383743043
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_de": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_es-bloom-7b1.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_es": {
|
4 |
-
"acc": 0.3316239316239316,
|
5 |
-
"acc_stderr": 0.013769752111910177,
|
6 |
-
"acc_norm": 0.3811965811965812,
|
7 |
-
"acc_norm_stderr": 0.01420507709573084
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_es": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=bigscience/bloom-7b1",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_es-llama-7B.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_es": {
|
4 |
-
"acc": 0.3606837606837607,
|
5 |
-
"acc_stderr": 0.014044746572948867,
|
6 |
-
"acc_norm": 0.3683760683760684,
|
7 |
-
"acc_norm_stderr": 0.014108074259155369
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_es": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_eu-bloom-7b1.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_eu": {
|
4 |
-
"acc": 0.22056239015817222,
|
5 |
-
"acc_stderr": 0.01229634886589257,
|
6 |
-
"acc_norm": 0.2521968365553603,
|
7 |
-
"acc_norm_stderr": 0.012879032347922939
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_eu": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=bigscience/bloom-7b1",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_eu-llama-7B.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_eu": {
|
4 |
-
"acc": 0.20738137082601055,
|
5 |
-
"acc_stderr": 0.012023662461166562,
|
6 |
-
"acc_norm": 0.2451669595782074,
|
7 |
-
"acc_norm_stderr": 0.012757811738008544
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_eu": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_fr-bloom-7b1.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_fr": {
|
4 |
-
"acc": 0.32677502138579984,
|
5 |
-
"acc_stderr": 0.01372407602199982,
|
6 |
-
"acc_norm": 0.3669803250641574,
|
7 |
-
"acc_norm_stderr": 0.014102904772197396
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_fr": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=bigscience/bloom-7b1",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_fr-llama-7B.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_fr": {
|
4 |
-
"acc": 0.3473053892215569,
|
5 |
-
"acc_stderr": 0.013931226499492353,
|
6 |
-
"acc_norm": 0.3729683490162532,
|
7 |
-
"acc_norm_stderr": 0.014150093168782438
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_fr": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_gu-bloom-7b1.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_gu": {
|
4 |
-
"acc": 0.2206896551724138,
|
5 |
-
"acc_stderr": 0.012181604374453973,
|
6 |
-
"acc_norm": 0.2336206896551724,
|
7 |
-
"acc_norm_stderr": 0.012428989430945793
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_gu": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=bigscience/bloom-7b1",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_gu-llama-7B.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_gu": {
|
4 |
-
"acc": 0.2120689655172414,
|
5 |
-
"acc_stderr": 0.012007177871292825,
|
6 |
-
"acc_norm": 0.23189655172413792,
|
7 |
-
"acc_norm_stderr": 0.012396962423413033
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_gu": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_hi-bloom-7b1.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_hi": {
|
4 |
-
"acc": 0.2363013698630137,
|
5 |
-
"acc_stderr": 0.012435369590403731,
|
6 |
-
"acc_norm": 0.2919520547945205,
|
7 |
-
"acc_norm_stderr": 0.013309191484613488
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_hi": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=bigscience/bloom-7b1",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_hi-llama-7B.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_hi": {
|
4 |
-
"acc": 0.21232876712328766,
|
5 |
-
"acc_stderr": 0.011971304657273123,
|
6 |
-
"acc_norm": 0.25,
|
7 |
-
"acc_norm_stderr": 0.012675503164084846
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_hi": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_hr-bloom-7b1.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_hr": {
|
4 |
-
"acc": 0.19332763045337895,
|
5 |
-
"acc_stderr": 0.011555111310342437,
|
6 |
-
"acc_norm": 0.2369546621043627,
|
7 |
-
"acc_norm_stderr": 0.012441890624187792
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_hr": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=bigscience/bloom-7b1",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_hr-llama-7B.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_hr": {
|
4 |
-
"acc": 0.2754491017964072,
|
5 |
-
"acc_stderr": 0.01307174925264165,
|
6 |
-
"acc_norm": 0.330196749358426,
|
7 |
-
"acc_norm_stderr": 0.013760638974726852
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_hr": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_hu-bloom-7b1.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_hu": {
|
4 |
-
"acc": 0.1969178082191781,
|
5 |
-
"acc_stderr": 0.011640913614197496,
|
6 |
-
"acc_norm": 0.2585616438356164,
|
7 |
-
"acc_norm_stderr": 0.0128169339627777
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_hu": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=bigscience/bloom-7b1",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_hu-llama-7B.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_hu": {
|
4 |
-
"acc": 0.2517123287671233,
|
5 |
-
"acc_stderr": 0.012704310825494622,
|
6 |
-
"acc_norm": 0.2979452054794521,
|
7 |
-
"acc_norm_stderr": 0.013388079339102703
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_hu": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_hy-bloom-7b1.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_hy": {
|
4 |
-
"acc": 0.21181818181818182,
|
5 |
-
"acc_stderr": 0.01232525683396216,
|
6 |
-
"acc_norm": 0.26181818181818184,
|
7 |
-
"acc_norm_stderr": 0.013261197012809796
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_hy": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=bigscience/bloom-7b1",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_hy-llama-7B.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_hy": {
|
4 |
-
"acc": 0.19454545454545455,
|
5 |
-
"acc_stderr": 0.011940766785664334,
|
6 |
-
"acc_norm": 0.2718181818181818,
|
7 |
-
"acc_norm_stderr": 0.013420241182110736
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_hy": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_id-bloom-7b1.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_id": {
|
4 |
-
"acc": 0.3128205128205128,
|
5 |
-
"acc_stderr": 0.013560492090917607,
|
6 |
-
"acc_norm": 0.3598290598290598,
|
7 |
-
"acc_norm_stderr": 0.014037469945597791
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_id": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=bigscience/bloom-7b1",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_id-llama-7B.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_id": {
|
4 |
-
"acc": 0.19316239316239317,
|
5 |
-
"acc_stderr": 0.011546413314069014,
|
6 |
-
"acc_norm": 0.26666666666666666,
|
7 |
-
"acc_norm_stderr": 0.012933850109759573
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_id": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_it-bloom-7b1.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_it": {
|
4 |
-
"acc": 0.24037639007698888,
|
5 |
-
"acc_stderr": 0.01250327289928353,
|
6 |
-
"acc_norm": 0.28999144568006846,
|
7 |
-
"acc_norm_stderr": 0.01327709194338097
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_it": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=bigscience/bloom-7b1",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_it-llama-7B.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_it": {
|
4 |
-
"acc": 0.31736526946107785,
|
5 |
-
"acc_stderr": 0.013619227292898307,
|
6 |
-
"acc_norm": 0.3575705731394354,
|
7 |
-
"acc_norm_stderr": 0.014024008839912006
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_it": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_kn-bloom-7b1.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_kn": {
|
4 |
-
"acc": 0.2221254355400697,
|
5 |
-
"acc_stderr": 0.012273607270054452,
|
6 |
-
"acc_norm": 0.24738675958188153,
|
7 |
-
"acc_norm_stderr": 0.012740675198098838
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_kn": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=bigscience/bloom-7b1",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_kn-llama-7B.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_kn": {
|
4 |
-
"acc": 0.20470383275261325,
|
5 |
-
"acc_stderr": 0.011913674295957856,
|
6 |
-
"acc_norm": 0.24738675958188153,
|
7 |
-
"acc_norm_stderr": 0.012740675198098834
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_kn": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_ml-bloom-7b1.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_ml": {
|
4 |
-
"acc": 0.2075306479859895,
|
5 |
-
"acc_stderr": 0.01200575665793095,
|
6 |
-
"acc_norm": 0.2635726795096322,
|
7 |
-
"acc_norm_stderr": 0.013042844591075362
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_ml": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=bigscience/bloom-7b1",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_ml-llama-7B.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_ml": {
|
4 |
-
"acc": 0.21628721541155868,
|
5 |
-
"acc_stderr": 0.012188522634632977,
|
6 |
-
"acc_norm": 0.27845884413309985,
|
7 |
-
"acc_norm_stderr": 0.013269918016014967
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_ml": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_mr-bloom-7b1.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_mr": {
|
4 |
-
"acc": 0.23376623376623376,
|
5 |
-
"acc_stderr": 0.012458582396003653,
|
6 |
-
"acc_norm": 0.2727272727272727,
|
7 |
-
"acc_norm_stderr": 0.013110221561502926
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_mr": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=bigscience/bloom-7b1",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_mr-llama-7B.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_mr": {
|
4 |
-
"acc": 0.2051948051948052,
|
5 |
-
"acc_stderr": 0.011888050053276677,
|
6 |
-
"acc_norm": 0.2545454545454545,
|
7 |
-
"acc_norm_stderr": 0.012823020964319998
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_mr": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_ne-bloom-7b1.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_ne": {
|
4 |
-
"acc": 0.21300256629597947,
|
5 |
-
"acc_stderr": 0.01198002307808546,
|
6 |
-
"acc_norm": 0.223267750213858,
|
7 |
-
"acc_norm_stderr": 0.012185048029719049
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_ne": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=bigscience/bloom-7b1",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_ne-llama-7B.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_ne": {
|
4 |
-
"acc": 0.2172797262617622,
|
5 |
-
"acc_stderr": 0.012066782166932105,
|
6 |
-
"acc_norm": 0.24294268605645852,
|
7 |
-
"acc_norm_stderr": 0.012548588352773893
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_ne": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_nl_Llama-2-7b-chat-hf.json
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"arc_nl": {
|
4 |
+
"acc": 0.3609923011120616,
|
5 |
+
"acc_stderr": 0.014053373664144792,
|
6 |
+
"acc_norm": 0.3618477331052181,
|
7 |
+
"acc_norm_stderr": 0.014060593893704966
|
8 |
+
}
|
9 |
+
},
|
10 |
+
"versions": {
|
11 |
+
"arc_nl": 0
|
12 |
+
},
|
13 |
+
"config": {
|
14 |
+
"model": "hf-auto",
|
15 |
+
"model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
16 |
+
"batch_size": 8,
|
17 |
+
"device": "cuda",
|
18 |
+
"no_cache": false,
|
19 |
+
"limit": null,
|
20 |
+
"bootstrap_iters": 100000,
|
21 |
+
"description_dict": {}
|
22 |
+
}
|
23 |
+
}
|
evals/{mmlu/mmlu_gu-bloom-7b1.json → arc/arc_nl_Llama-2-7b-hf.json}
RENAMED
@@ -1,19 +1,19 @@
|
|
1 |
{
|
2 |
"results": {
|
3 |
-
"
|
4 |
-
"acc": 0.
|
5 |
-
"acc_stderr": 0.
|
6 |
-
"acc_norm": 0.
|
7 |
-
"acc_norm_stderr": 0.
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
11 |
-
"
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=
|
16 |
-
"batch_size":
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
|
|
1 |
{
|
2 |
"results": {
|
3 |
+
"arc_nl": {
|
4 |
+
"acc": 0.33704020530367834,
|
5 |
+
"acc_stderr": 0.013831300903580639,
|
6 |
+
"acc_norm": 0.3567151411462789,
|
7 |
+
"acc_norm_stderr": 0.014016546277185005
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
11 |
+
"arc_nl": 0
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
+
"model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
16 |
+
"batch_size": 8,
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
evals/arc/{arc_ar-llama-7B.json → arc_nl_Mistral-7B-v0.1.json}
RENAMED
@@ -1,19 +1,19 @@
|
|
1 |
{
|
2 |
"results": {
|
3 |
-
"
|
4 |
-
"acc": 0.
|
5 |
-
"acc_stderr": 0.
|
6 |
-
"acc_norm": 0.
|
7 |
-
"acc_norm_stderr": 0.
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
11 |
-
"
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained
|
16 |
-
"batch_size":
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
|
|
1 |
{
|
2 |
"results": {
|
3 |
+
"arc_nl": {
|
4 |
+
"acc": 0.42087254063301965,
|
5 |
+
"acc_stderr": 0.014445778557368833,
|
6 |
+
"acc_norm": 0.4294268605645851,
|
7 |
+
"acc_norm_stderr": 0.014483677397351059
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
11 |
+
"arc_nl": 0
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
+
"model_args": "pretrained=mistralai/Mistral-7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
16 |
+
"batch_size": 8,
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
evals/arc/{arc_bn-bloom-7b1.json → arc_nl_zephyr-7b-beta.json}
RENAMED
@@ -1,19 +1,19 @@
|
|
1 |
{
|
2 |
"results": {
|
3 |
-
"
|
4 |
-
"acc": 0.
|
5 |
-
"acc_stderr": 0.
|
6 |
-
"acc_norm": 0.
|
7 |
-
"acc_norm_stderr": 0.
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
11 |
-
"
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=
|
16 |
-
"batch_size":
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
|
|
1 |
{
|
2 |
"results": {
|
3 |
+
"arc_nl": {
|
4 |
+
"acc": 0.43798118049615054,
|
5 |
+
"acc_stderr": 0.01451716231691793,
|
6 |
+
"acc_norm": 0.4328485885372113,
|
7 |
+
"acc_norm_stderr": 0.01449759923259859
|
8 |
}
|
9 |
},
|
10 |
"versions": {
|
11 |
+
"arc_nl": 0
|
12 |
},
|
13 |
"config": {
|
14 |
"model": "hf-auto",
|
15 |
+
"model_args": "pretrained=HuggingFaceH4/zephyr-7b-beta,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
|
16 |
+
"batch_size": 8,
|
17 |
"device": "cuda",
|
18 |
"no_cache": false,
|
19 |
"limit": null,
|
evals/arc/arc_pt-bloom-7b1.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_pt": {
|
4 |
-
"acc": 0.3401709401709402,
|
5 |
-
"acc_stderr": 0.013856612397310694,
|
6 |
-
"acc_norm": 0.4,
|
7 |
-
"acc_norm_stderr": 0.014328422047021531
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_pt": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=bigscience/bloom-7b1",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_pt-llama-7B.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_pt": {
|
4 |
-
"acc": 0.3367521367521368,
|
5 |
-
"acc_stderr": 0.01382247630777062,
|
6 |
-
"acc_norm": 0.37777777777777777,
|
7 |
-
"acc_norm_stderr": 0.014180244103534094
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_pt": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_ro-bloom-7b1.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_ro": {
|
4 |
-
"acc": 0.2099400171379606,
|
5 |
-
"acc_stderr": 0.011926921791273557,
|
6 |
-
"acc_norm": 0.26906598114824337,
|
7 |
-
"acc_norm_stderr": 0.012987310039914976
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_ro": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=bigscience/bloom-7b1",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_ro-llama-7B.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_ro": {
|
4 |
-
"acc": 0.30077120822622105,
|
5 |
-
"acc_stderr": 0.013430077114209907,
|
6 |
-
"acc_norm": 0.32390745501285345,
|
7 |
-
"acc_norm_stderr": 0.013704533924425027
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_ro": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_ru-bloom-7b1.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_ru": {
|
4 |
-
"acc": 0.21043627031650983,
|
5 |
-
"acc_stderr": 0.01192703439080346,
|
6 |
-
"acc_norm": 0.2754491017964072,
|
7 |
-
"acc_norm_stderr": 0.01307174925264165
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_ru": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=bigscience/bloom-7b1",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_ru-llama-7B.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_ru": {
|
4 |
-
"acc": 0.2934131736526946,
|
5 |
-
"acc_stderr": 0.013322973103306575,
|
6 |
-
"acc_norm": 0.32078699743370404,
|
7 |
-
"acc_norm_stderr": 0.013658089444975752
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_ru": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_sk-bloom-7b1.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_sk": {
|
4 |
-
"acc": 0.20359281437125748,
|
5 |
-
"acc_stderr": 0.011782227020010716,
|
6 |
-
"acc_norm": 0.24893071000855432,
|
7 |
-
"acc_norm_stderr": 0.012651960282598879
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_sk": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=bigscience/bloom-7b1",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_sk-llama-7B.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_sk": {
|
4 |
-
"acc": 0.23609923011120615,
|
5 |
-
"acc_stderr": 0.012426371635795894,
|
6 |
-
"acc_norm": 0.28999144568006846,
|
7 |
-
"acc_norm_stderr": 0.013277091943380979
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_sk": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_sr-bloom-7b1.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_sr": {
|
4 |
-
"acc": 0.2172797262617622,
|
5 |
-
"acc_stderr": 0.012066782166932079,
|
6 |
-
"acc_norm": 0.25149700598802394,
|
7 |
-
"acc_norm_stderr": 0.01269526466186626
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_sr": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=bigscience/bloom-7b1",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evals/arc/arc_sr-llama-7B.json
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"results": {
|
3 |
-
"arc_sr": {
|
4 |
-
"acc": 0.25748502994011974,
|
5 |
-
"acc_stderr": 0.012794024494042348,
|
6 |
-
"acc_norm": 0.30795551753635586,
|
7 |
-
"acc_norm_stderr": 0.013507954174822524
|
8 |
-
}
|
9 |
-
},
|
10 |
-
"versions": {
|
11 |
-
"arc_sr": 0
|
12 |
-
},
|
13 |
-
"config": {
|
14 |
-
"model": "hf-auto",
|
15 |
-
"model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
|
16 |
-
"batch_size": 1,
|
17 |
-
"device": "cuda",
|
18 |
-
"no_cache": false,
|
19 |
-
"limit": null,
|
20 |
-
"bootstrap_iters": 100000,
|
21 |
-
"description_dict": {}
|
22 |
-
}
|
23 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|