Bram Vanroy committed on
Commit
863e074
1 Parent(s): 095087c

update with only Dutch

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. app.py +54 -60
  2. css.py +2 -2
  3. evals/arc/arc_ca-bloom-7b1.json +0 -23
  4. evals/arc/arc_ca-llama-7B.json +0 -23
  5. evals/arc/arc_da-bloom-7b1.json +0 -23
  6. evals/arc/arc_da-llama-7B.json +0 -23
  7. evals/arc/arc_de-bloom-7b1.json +0 -23
  8. evals/arc/arc_de-llama-7B.json +0 -23
  9. evals/arc/arc_es-bloom-7b1.json +0 -23
  10. evals/arc/arc_es-llama-7B.json +0 -23
  11. evals/arc/arc_eu-bloom-7b1.json +0 -23
  12. evals/arc/arc_eu-llama-7B.json +0 -23
  13. evals/arc/arc_fr-bloom-7b1.json +0 -23
  14. evals/arc/arc_fr-llama-7B.json +0 -23
  15. evals/arc/arc_gu-bloom-7b1.json +0 -23
  16. evals/arc/arc_gu-llama-7B.json +0 -23
  17. evals/arc/arc_hi-bloom-7b1.json +0 -23
  18. evals/arc/arc_hi-llama-7B.json +0 -23
  19. evals/arc/arc_hr-bloom-7b1.json +0 -23
  20. evals/arc/arc_hr-llama-7B.json +0 -23
  21. evals/arc/arc_hu-bloom-7b1.json +0 -23
  22. evals/arc/arc_hu-llama-7B.json +0 -23
  23. evals/arc/arc_hy-bloom-7b1.json +0 -23
  24. evals/arc/arc_hy-llama-7B.json +0 -23
  25. evals/arc/arc_id-bloom-7b1.json +0 -23
  26. evals/arc/arc_id-llama-7B.json +0 -23
  27. evals/arc/arc_it-bloom-7b1.json +0 -23
  28. evals/arc/arc_it-llama-7B.json +0 -23
  29. evals/arc/arc_kn-bloom-7b1.json +0 -23
  30. evals/arc/arc_kn-llama-7B.json +0 -23
  31. evals/arc/arc_ml-bloom-7b1.json +0 -23
  32. evals/arc/arc_ml-llama-7B.json +0 -23
  33. evals/arc/arc_mr-bloom-7b1.json +0 -23
  34. evals/arc/arc_mr-llama-7B.json +0 -23
  35. evals/arc/arc_ne-bloom-7b1.json +0 -23
  36. evals/arc/arc_ne-llama-7B.json +0 -23
  37. evals/arc/arc_nl_Llama-2-7b-chat-hf.json +23 -0
  38. evals/{mmlu/mmlu_gu-bloom-7b1.json → arc/arc_nl_Llama-2-7b-hf.json} +8 -8
  39. evals/arc/{arc_ar-llama-7B.json → arc_nl_Mistral-7B-v0.1.json} +8 -8
  40. evals/arc/{arc_bn-bloom-7b1.json → arc_nl_zephyr-7b-beta.json} +8 -8
  41. evals/arc/arc_pt-bloom-7b1.json +0 -23
  42. evals/arc/arc_pt-llama-7B.json +0 -23
  43. evals/arc/arc_ro-bloom-7b1.json +0 -23
  44. evals/arc/arc_ro-llama-7B.json +0 -23
  45. evals/arc/arc_ru-bloom-7b1.json +0 -23
  46. evals/arc/arc_ru-llama-7B.json +0 -23
  47. evals/arc/arc_sk-bloom-7b1.json +0 -23
  48. evals/arc/arc_sk-llama-7B.json +0 -23
  49. evals/arc/arc_sr-bloom-7b1.json +0 -23
  50. evals/arc/arc_sr-llama-7B.json +0 -23
app.py CHANGED
@@ -1,9 +1,10 @@
1
- import os
2
  import json
3
- import glob
4
  from collections import defaultdict
 
 
5
  import pandas as pd
6
  import gradio as gr
 
7
  from content import *
8
  from css import *
9
  import glob
@@ -16,74 +17,74 @@ BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA]
16
 
17
  METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
18
 
19
- LANGS = 'ar,bn,ca,da,de,es,eu,fr,gu,hi,hr,hu,hy,id,it,kn,ml,mr,ne,nl,pt,ro,ru,sk,sr,sv,ta,te,uk,vi,zh'.split(',')
20
 
21
  LANG_NAME = {
22
- 'ar': 'Arabic',
23
- 'bn': 'Bengali',
24
- 'ca': 'Catalan',
25
- 'da': 'Danish',
26
- 'de': 'German',
27
- 'es': 'Spanish',
28
- 'eu': 'Basque',
29
- 'fr': 'French',
30
- 'gu': 'Gujarati',
31
- 'hi': 'Hindi',
32
- 'hr': 'Croatian',
33
- 'hu': 'Hungarian',
34
- 'hy': 'Armenian',
35
- 'id': 'Indonesian',
36
- 'it': 'Italian',
37
- 'kn': 'Kannada',
38
- 'ml': 'Malayalam',
39
- 'mr': 'Marathi',
40
- 'ne': 'Nepali',
41
- 'nl': 'Dutch',
42
- 'pt': 'Portuguese',
43
- 'ro': 'Romanian',
44
- 'ru': 'Russian',
45
- 'sk': 'Slovak',
46
- 'sr': 'Serbian',
47
- 'sv': 'Swedish',
48
- 'ta': 'Tamil',
49
- 'te': 'Telugu',
50
- 'uk': 'Ukrainian',
51
- 'vi': 'Vietnamese',
52
- 'zh': 'Chinese'
53
  }
54
 
55
 
56
  def collect_results():
57
  performance_dict = defaultdict(dict)
58
  pretrained_models = set()
59
- for file in glob.glob('evals/*/*.json'):
60
- with open(file, 'r') as f:
61
- data = json.load(f)
62
- if 'results' not in data:
63
  continue
64
- if 'config' not in data:
65
  continue
66
- results = data['results']
67
- config = data['config']
68
- if 'model_args' not in config:
69
  continue
70
 
71
- model_args = config['model_args'].split(',')
72
- pretrained = [x for x in model_args if x.startswith('pretrained=')]
73
  if len(pretrained) != 1:
74
  continue
75
- pretrained = pretrained[0].split('=')[1]
76
- pretrained = pretrained.split('/')[-1]
77
  pretrained_models.add(pretrained)
78
 
79
  for lang_task, perfs in results.items():
80
- task, lang = lang_task.split('_')
81
  assert task in BENCHMARKS
82
 
83
  if lang and task:
84
  metric = METRICS[BENCHMARKS.index(task)]
85
  p = round(perfs[metric] * 100, 1)
86
  performance_dict[(pretrained, lang)][task] = p
 
87
  return performance_dict, pretrained_models
88
 
89
 
@@ -96,15 +97,13 @@ def get_leaderboard_df(performance_dict, pretrained_models):
96
  mmlu_perf = perfs.get(MMLU, 0.0)
97
  truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
98
 
99
- if arc_perf * hellaswag_perf * mmlu_perf * truthfulqa_perf == 0:
100
- continue
101
  avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
102
- notes = ' '.join([pretrained, lang_name])
103
- row = [pretrained, lang_name, lang, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf, notes]
104
  df.append(row)
105
 
106
  df = pd.DataFrame.from_records(df, columns=COLS)
107
- df = df.sort_values(by=[LANG_COL, AVERAGE_COL], ascending=False)
108
  df = df[COLS]
109
 
110
  return df
@@ -115,10 +114,7 @@ def search_table(df, query):
115
  return filtered_df
116
 
117
 
118
-
119
  MODEL_COL = "Model"
120
- LANG_COL = "Language"
121
- CODE_COL = "Code"
122
  AVERAGE_COL = "Average"
123
  ARC_COL = "ARC (25-shot)"
124
  HELLASWAG_COL = "HellaSwag (10-shot)️"
@@ -126,8 +122,8 @@ MMLU_COL = "MMLU (5-shot)"
126
  TRUTHFULQA_COL = "TruthfulQA (0-shot)"
127
  NOTES_COL = "Notes" # For search only
128
 
129
- COLS = [MODEL_COL, LANG_COL, CODE_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL, NOTES_COL]
130
- TYPES = ["str", "str", "str", "number", "number", "number", "number", "number", "str"]
131
 
132
  args = collect_results()
133
  original_df = get_leaderboard_df(*args)
@@ -139,9 +135,7 @@ with demo:
139
  gr.Markdown(HOW_TO, elem_classes="markdown-text")
140
 
141
  with gr.Box():
142
- search_bar = gr.Textbox(
143
- placeholder="Search models and languages...", show_label=False, elem_id="search-bar"
144
- )
145
 
146
  leaderboard_table = gr.components.Dataframe(
147
  value=original_df,
 
 
1
  import json
 
2
  from collections import defaultdict
3
+ from pathlib import Path
4
+
5
  import pandas as pd
6
  import gradio as gr
7
+
8
  from content import *
9
  from css import *
10
  import glob
 
17
 
18
  METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
19
 
20
+ LANGS = "ar,bn,ca,da,de,es,eu,fr,gu,hi,hr,hu,hy,id,it,kn,ml,mr,ne,nl,pt,ro,ru,sk,sr,sv,ta,te,uk,vi,zh".split(",")
21
 
22
  LANG_NAME = {
23
+ "ar": "Arabic",
24
+ "bn": "Bengali",
25
+ "ca": "Catalan",
26
+ "da": "Danish",
27
+ "de": "German",
28
+ "es": "Spanish",
29
+ "eu": "Basque",
30
+ "fr": "French",
31
+ "gu": "Gujarati",
32
+ "hi": "Hindi",
33
+ "hr": "Croatian",
34
+ "hu": "Hungarian",
35
+ "hy": "Armenian",
36
+ "id": "Indonesian",
37
+ "it": "Italian",
38
+ "kn": "Kannada",
39
+ "ml": "Malayalam",
40
+ "mr": "Marathi",
41
+ "ne": "Nepali",
42
+ "nl": "Dutch",
43
+ "pt": "Portuguese",
44
+ "ro": "Romanian",
45
+ "ru": "Russian",
46
+ "sk": "Slovak",
47
+ "sr": "Serbian",
48
+ "sv": "Swedish",
49
+ "ta": "Tamil",
50
+ "te": "Telugu",
51
+ "uk": "Ukrainian",
52
+ "vi": "Vietnamese",
53
+ "zh": "Chinese",
54
  }
55
 
56
 
57
  def collect_results():
58
  performance_dict = defaultdict(dict)
59
  pretrained_models = set()
60
+ for pfin in Path("evals").rglob("*.json"):
61
+ data = json.loads(pfin.read_text(encoding="utf-8"))
62
+ if "results" not in data:
 
63
  continue
64
+ if "config" not in data:
65
  continue
66
+ results = data["results"]
67
+ config = data["config"]
68
+ if "model_args" not in config:
69
  continue
70
 
71
+ model_args = config["model_args"].split(",")
72
+ pretrained = [x for x in model_args if x.startswith("pretrained=")]
73
  if len(pretrained) != 1:
74
  continue
75
+ pretrained = pretrained[0].split("=")[1]
76
+ pretrained = pretrained.split("/")[-1]
77
  pretrained_models.add(pretrained)
78
 
79
  for lang_task, perfs in results.items():
80
+ task, lang = lang_task.split("_")
81
  assert task in BENCHMARKS
82
 
83
  if lang and task:
84
  metric = METRICS[BENCHMARKS.index(task)]
85
  p = round(perfs[metric] * 100, 1)
86
  performance_dict[(pretrained, lang)][task] = p
87
+
88
  return performance_dict, pretrained_models
89
 
90
 
 
97
  mmlu_perf = perfs.get(MMLU, 0.0)
98
  truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)
99
 
 
 
100
  avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
101
+ notes = " ".join([pretrained, lang_name])
102
+ row = [pretrained, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf, notes]
103
  df.append(row)
104
 
105
  df = pd.DataFrame.from_records(df, columns=COLS)
106
+ df = df.sort_values(by=[AVERAGE_COL], ascending=False)
107
  df = df[COLS]
108
 
109
  return df
 
114
  return filtered_df
115
 
116
 
 
117
  MODEL_COL = "Model"
 
 
118
  AVERAGE_COL = "Average"
119
  ARC_COL = "ARC (25-shot)"
120
  HELLASWAG_COL = "HellaSwag (10-shot)️"
 
122
  TRUTHFULQA_COL = "TruthfulQA (0-shot)"
123
  NOTES_COL = "Notes" # For search only
124
 
125
+ COLS = [MODEL_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL, NOTES_COL]
126
+ TYPES = ["str", "number", "number", "number", "number", "number", "str"]
127
 
128
  args = collect_results()
129
  original_df = get_leaderboard_df(*args)
 
135
  gr.Markdown(HOW_TO, elem_classes="markdown-text")
136
 
137
  with gr.Box():
138
+ search_bar = gr.Textbox(placeholder="Search models and languages...", show_label=False, elem_id="search-bar")
 
 
139
 
140
  leaderboard_table = gr.components.Dataframe(
141
  value=original_df,
css.py CHANGED
@@ -1,4 +1,4 @@
1
- CUSTOM_CSS= """
2
  /* Hides the final column */
3
  table td:last-child,
4
  table th:last-child {
@@ -10,4 +10,4 @@ table th:last-child {
10
  # overflow: auto;
11
  # white-space: nowrap;
12
  # }
13
- """
 
1
+ CUSTOM_CSS = """
2
  /* Hides the final column */
3
  table td:last-child,
4
  table th:last-child {
 
10
  # overflow: auto;
11
  # white-space: nowrap;
12
  # }
13
+ """
evals/arc/arc_ca-bloom-7b1.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_ca": {
4
- "acc": 0.31989708404802747,
5
- "acc_stderr": 0.01366562491926326,
6
- "acc_norm": 0.34734133790737565,
7
- "acc_norm_stderr": 0.013949489903701517
8
- }
9
- },
10
- "versions": {
11
- "arc_ca": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=bigscience/bloom-7b1",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_ca-llama-7B.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_ca": {
4
- "acc": 0.3276157804459691,
5
- "acc_stderr": 0.01375080741597368,
6
- "acc_norm": 0.3507718696397942,
7
- "acc_norm_stderr": 0.013981316936172217
8
- }
9
- },
10
- "versions": {
11
- "arc_ca": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_da-bloom-7b1.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_da": {
4
- "acc": 0.20137103684661525,
5
- "acc_stderr": 0.011744154502532795,
6
- "acc_norm": 0.24592973436161097,
7
- "acc_norm_stderr": 0.012611366681285752
8
- }
9
- },
10
- "versions": {
11
- "arc_da": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=bigscience/bloom-7b1",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_da-llama-7B.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_da": {
4
- "acc": 0.286203941730934,
5
- "acc_stderr": 0.013236574332463879,
6
- "acc_norm": 0.3273350471293916,
7
- "acc_norm_stderr": 0.013741887176251822
8
- }
9
- },
10
- "versions": {
11
- "arc_da": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_de-bloom-7b1.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_de": {
4
- "acc": 0.22241231822070145,
5
- "acc_stderr": 0.012168377742629776,
6
- "acc_norm": 0.262617621899059,
7
- "acc_norm_stderr": 0.01287617552045283
8
- }
9
- },
10
- "versions": {
11
- "arc_de": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=bigscience/bloom-7b1",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_de-llama-7B.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_de": {
4
- "acc": 0.2951240376390077,
5
- "acc_stderr": 0.013345572865502645,
6
- "acc_norm": 0.35072711719418304,
7
- "acc_norm_stderr": 0.013962940383743043
8
- }
9
- },
10
- "versions": {
11
- "arc_de": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_es-bloom-7b1.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_es": {
4
- "acc": 0.3316239316239316,
5
- "acc_stderr": 0.013769752111910177,
6
- "acc_norm": 0.3811965811965812,
7
- "acc_norm_stderr": 0.01420507709573084
8
- }
9
- },
10
- "versions": {
11
- "arc_es": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=bigscience/bloom-7b1",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_es-llama-7B.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_es": {
4
- "acc": 0.3606837606837607,
5
- "acc_stderr": 0.014044746572948867,
6
- "acc_norm": 0.3683760683760684,
7
- "acc_norm_stderr": 0.014108074259155369
8
- }
9
- },
10
- "versions": {
11
- "arc_es": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_eu-bloom-7b1.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_eu": {
4
- "acc": 0.22056239015817222,
5
- "acc_stderr": 0.01229634886589257,
6
- "acc_norm": 0.2521968365553603,
7
- "acc_norm_stderr": 0.012879032347922939
8
- }
9
- },
10
- "versions": {
11
- "arc_eu": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=bigscience/bloom-7b1",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_eu-llama-7B.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_eu": {
4
- "acc": 0.20738137082601055,
5
- "acc_stderr": 0.012023662461166562,
6
- "acc_norm": 0.2451669595782074,
7
- "acc_norm_stderr": 0.012757811738008544
8
- }
9
- },
10
- "versions": {
11
- "arc_eu": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_fr-bloom-7b1.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_fr": {
4
- "acc": 0.32677502138579984,
5
- "acc_stderr": 0.01372407602199982,
6
- "acc_norm": 0.3669803250641574,
7
- "acc_norm_stderr": 0.014102904772197396
8
- }
9
- },
10
- "versions": {
11
- "arc_fr": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=bigscience/bloom-7b1",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_fr-llama-7B.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_fr": {
4
- "acc": 0.3473053892215569,
5
- "acc_stderr": 0.013931226499492353,
6
- "acc_norm": 0.3729683490162532,
7
- "acc_norm_stderr": 0.014150093168782438
8
- }
9
- },
10
- "versions": {
11
- "arc_fr": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_gu-bloom-7b1.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_gu": {
4
- "acc": 0.2206896551724138,
5
- "acc_stderr": 0.012181604374453973,
6
- "acc_norm": 0.2336206896551724,
7
- "acc_norm_stderr": 0.012428989430945793
8
- }
9
- },
10
- "versions": {
11
- "arc_gu": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=bigscience/bloom-7b1",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_gu-llama-7B.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_gu": {
4
- "acc": 0.2120689655172414,
5
- "acc_stderr": 0.012007177871292825,
6
- "acc_norm": 0.23189655172413792,
7
- "acc_norm_stderr": 0.012396962423413033
8
- }
9
- },
10
- "versions": {
11
- "arc_gu": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_hi-bloom-7b1.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_hi": {
4
- "acc": 0.2363013698630137,
5
- "acc_stderr": 0.012435369590403731,
6
- "acc_norm": 0.2919520547945205,
7
- "acc_norm_stderr": 0.013309191484613488
8
- }
9
- },
10
- "versions": {
11
- "arc_hi": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=bigscience/bloom-7b1",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_hi-llama-7B.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_hi": {
4
- "acc": 0.21232876712328766,
5
- "acc_stderr": 0.011971304657273123,
6
- "acc_norm": 0.25,
7
- "acc_norm_stderr": 0.012675503164084846
8
- }
9
- },
10
- "versions": {
11
- "arc_hi": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_hr-bloom-7b1.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_hr": {
4
- "acc": 0.19332763045337895,
5
- "acc_stderr": 0.011555111310342437,
6
- "acc_norm": 0.2369546621043627,
7
- "acc_norm_stderr": 0.012441890624187792
8
- }
9
- },
10
- "versions": {
11
- "arc_hr": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=bigscience/bloom-7b1",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_hr-llama-7B.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_hr": {
4
- "acc": 0.2754491017964072,
5
- "acc_stderr": 0.01307174925264165,
6
- "acc_norm": 0.330196749358426,
7
- "acc_norm_stderr": 0.013760638974726852
8
- }
9
- },
10
- "versions": {
11
- "arc_hr": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_hu-bloom-7b1.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_hu": {
4
- "acc": 0.1969178082191781,
5
- "acc_stderr": 0.011640913614197496,
6
- "acc_norm": 0.2585616438356164,
7
- "acc_norm_stderr": 0.0128169339627777
8
- }
9
- },
10
- "versions": {
11
- "arc_hu": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=bigscience/bloom-7b1",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_hu-llama-7B.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_hu": {
4
- "acc": 0.2517123287671233,
5
- "acc_stderr": 0.012704310825494622,
6
- "acc_norm": 0.2979452054794521,
7
- "acc_norm_stderr": 0.013388079339102703
8
- }
9
- },
10
- "versions": {
11
- "arc_hu": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_hy-bloom-7b1.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_hy": {
4
- "acc": 0.21181818181818182,
5
- "acc_stderr": 0.01232525683396216,
6
- "acc_norm": 0.26181818181818184,
7
- "acc_norm_stderr": 0.013261197012809796
8
- }
9
- },
10
- "versions": {
11
- "arc_hy": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=bigscience/bloom-7b1",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_hy-llama-7B.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_hy": {
4
- "acc": 0.19454545454545455,
5
- "acc_stderr": 0.011940766785664334,
6
- "acc_norm": 0.2718181818181818,
7
- "acc_norm_stderr": 0.013420241182110736
8
- }
9
- },
10
- "versions": {
11
- "arc_hy": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_id-bloom-7b1.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_id": {
4
- "acc": 0.3128205128205128,
5
- "acc_stderr": 0.013560492090917607,
6
- "acc_norm": 0.3598290598290598,
7
- "acc_norm_stderr": 0.014037469945597791
8
- }
9
- },
10
- "versions": {
11
- "arc_id": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=bigscience/bloom-7b1",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_id-llama-7B.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_id": {
4
- "acc": 0.19316239316239317,
5
- "acc_stderr": 0.011546413314069014,
6
- "acc_norm": 0.26666666666666666,
7
- "acc_norm_stderr": 0.012933850109759573
8
- }
9
- },
10
- "versions": {
11
- "arc_id": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_it-bloom-7b1.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_it": {
4
- "acc": 0.24037639007698888,
5
- "acc_stderr": 0.01250327289928353,
6
- "acc_norm": 0.28999144568006846,
7
- "acc_norm_stderr": 0.01327709194338097
8
- }
9
- },
10
- "versions": {
11
- "arc_it": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=bigscience/bloom-7b1",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_it-llama-7B.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_it": {
4
- "acc": 0.31736526946107785,
5
- "acc_stderr": 0.013619227292898307,
6
- "acc_norm": 0.3575705731394354,
7
- "acc_norm_stderr": 0.014024008839912006
8
- }
9
- },
10
- "versions": {
11
- "arc_it": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_kn-bloom-7b1.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_kn": {
4
- "acc": 0.2221254355400697,
5
- "acc_stderr": 0.012273607270054452,
6
- "acc_norm": 0.24738675958188153,
7
- "acc_norm_stderr": 0.012740675198098838
8
- }
9
- },
10
- "versions": {
11
- "arc_kn": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=bigscience/bloom-7b1",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_kn-llama-7B.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_kn": {
4
- "acc": 0.20470383275261325,
5
- "acc_stderr": 0.011913674295957856,
6
- "acc_norm": 0.24738675958188153,
7
- "acc_norm_stderr": 0.012740675198098834
8
- }
9
- },
10
- "versions": {
11
- "arc_kn": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_ml-bloom-7b1.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_ml": {
4
- "acc": 0.2075306479859895,
5
- "acc_stderr": 0.01200575665793095,
6
- "acc_norm": 0.2635726795096322,
7
- "acc_norm_stderr": 0.013042844591075362
8
- }
9
- },
10
- "versions": {
11
- "arc_ml": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=bigscience/bloom-7b1",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_ml-llama-7B.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_ml": {
4
- "acc": 0.21628721541155868,
5
- "acc_stderr": 0.012188522634632977,
6
- "acc_norm": 0.27845884413309985,
7
- "acc_norm_stderr": 0.013269918016014967
8
- }
9
- },
10
- "versions": {
11
- "arc_ml": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_mr-bloom-7b1.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_mr": {
4
- "acc": 0.23376623376623376,
5
- "acc_stderr": 0.012458582396003653,
6
- "acc_norm": 0.2727272727272727,
7
- "acc_norm_stderr": 0.013110221561502926
8
- }
9
- },
10
- "versions": {
11
- "arc_mr": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=bigscience/bloom-7b1",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_mr-llama-7B.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_mr": {
4
- "acc": 0.2051948051948052,
5
- "acc_stderr": 0.011888050053276677,
6
- "acc_norm": 0.2545454545454545,
7
- "acc_norm_stderr": 0.012823020964319998
8
- }
9
- },
10
- "versions": {
11
- "arc_mr": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_ne-bloom-7b1.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_ne": {
4
- "acc": 0.21300256629597947,
5
- "acc_stderr": 0.01198002307808546,
6
- "acc_norm": 0.223267750213858,
7
- "acc_norm_stderr": 0.012185048029719049
8
- }
9
- },
10
- "versions": {
11
- "arc_ne": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=bigscience/bloom-7b1",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_ne-llama-7B.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_ne": {
4
- "acc": 0.2172797262617622,
5
- "acc_stderr": 0.012066782166932105,
6
- "acc_norm": 0.24294268605645852,
7
- "acc_norm_stderr": 0.012548588352773893
8
- }
9
- },
10
- "versions": {
11
- "arc_ne": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_nl_Llama-2-7b-chat-hf.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_nl": {
4
+ "acc": 0.3609923011120616,
5
+ "acc_stderr": 0.014053373664144792,
6
+ "acc_norm": 0.3618477331052181,
7
+ "acc_norm_stderr": 0.014060593893704966
8
+ }
9
+ },
10
+ "versions": {
11
+ "arc_nl": 0
12
+ },
13
+ "config": {
14
+ "model": "hf-auto",
15
+ "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
+ "batch_size": 8,
17
+ "device": "cuda",
18
+ "no_cache": false,
19
+ "limit": null,
20
+ "bootstrap_iters": 100000,
21
+ "description_dict": {}
22
+ }
23
+ }
evals/{mmlu/mmlu_gu-bloom-7b1.json → arc/arc_nl_Llama-2-7b-hf.json} RENAMED
@@ -1,19 +1,19 @@
1
  {
2
  "results": {
3
- "mmlu_gu": {
4
- "acc": 0.24933390631714655,
5
- "acc_stderr": 0.004010971174274014,
6
- "acc_norm": 0.26566394499355395,
7
- "acc_norm_stderr": 0.004094955673385403
8
  }
9
  },
10
  "versions": {
11
- "mmlu_gu": 0
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=bigscience/bloom-7b1",
16
- "batch_size": 1,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
+ "arc_nl": {
4
+ "acc": 0.33704020530367834,
5
+ "acc_stderr": 0.013831300903580639,
6
+ "acc_norm": 0.3567151411462789,
7
+ "acc_norm_stderr": 0.014016546277185005
8
  }
9
  },
10
  "versions": {
11
+ "arc_nl": 0
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=meta-llama/Llama-2-7b-hf,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
+ "batch_size": 8,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/arc/{arc_ar-llama-7B.json → arc_nl_Mistral-7B-v0.1.json} RENAMED
@@ -1,19 +1,19 @@
1
  {
2
  "results": {
3
- "arc_ar": {
4
- "acc": 0.19760479041916168,
5
- "acc_stderr": 0.011651221980953499,
6
- "acc_norm": 0.24636441402908468,
7
- "acc_norm_stderr": 0.012608059960468694
8
  }
9
  },
10
  "versions": {
11
- "arc_ar": 0
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
- "batch_size": 1,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
+ "arc_nl": {
4
+ "acc": 0.42087254063301965,
5
+ "acc_stderr": 0.014445778557368833,
6
+ "acc_norm": 0.4294268605645851,
7
+ "acc_norm_stderr": 0.014483677397351059
8
  }
9
  },
10
  "versions": {
11
+ "arc_nl": 0
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=mistralai/Mistral-7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
+ "batch_size": 8,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/arc/{arc_bn-bloom-7b1.json → arc_nl_zephyr-7b-beta.json} RENAMED
@@ -1,19 +1,19 @@
1
  {
2
  "results": {
3
- "arc_bn": {
4
- "acc": 0.22412318220701455,
5
- "acc_stderr": 0.012201644195165715,
6
- "acc_norm": 0.2617621899059025,
7
- "acc_norm_stderr": 0.012862641889254466
8
  }
9
  },
10
  "versions": {
11
- "arc_bn": 0
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
- "model_args": "pretrained=bigscience/bloom-7b1",
16
- "batch_size": 1,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
 
1
  {
2
  "results": {
3
+ "arc_nl": {
4
+ "acc": 0.43798118049615054,
5
+ "acc_stderr": 0.01451716231691793,
6
+ "acc_norm": 0.4328485885372113,
7
+ "acc_norm_stderr": 0.01449759923259859
8
  }
9
  },
10
  "versions": {
11
+ "arc_nl": 0
12
  },
13
  "config": {
14
  "model": "hf-auto",
15
+ "model_args": "pretrained=HuggingFaceH4/zephyr-7b-beta,use_accelerate=True,device_map_option=auto,dtype=bfloat16",
16
+ "batch_size": 8,
17
  "device": "cuda",
18
  "no_cache": false,
19
  "limit": null,
evals/arc/arc_pt-bloom-7b1.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_pt": {
4
- "acc": 0.3401709401709402,
5
- "acc_stderr": 0.013856612397310694,
6
- "acc_norm": 0.4,
7
- "acc_norm_stderr": 0.014328422047021531
8
- }
9
- },
10
- "versions": {
11
- "arc_pt": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=bigscience/bloom-7b1",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_pt-llama-7B.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_pt": {
4
- "acc": 0.3367521367521368,
5
- "acc_stderr": 0.01382247630777062,
6
- "acc_norm": 0.37777777777777777,
7
- "acc_norm_stderr": 0.014180244103534094
8
- }
9
- },
10
- "versions": {
11
- "arc_pt": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_ro-bloom-7b1.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_ro": {
4
- "acc": 0.2099400171379606,
5
- "acc_stderr": 0.011926921791273557,
6
- "acc_norm": 0.26906598114824337,
7
- "acc_norm_stderr": 0.012987310039914976
8
- }
9
- },
10
- "versions": {
11
- "arc_ro": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=bigscience/bloom-7b1",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_ro-llama-7B.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_ro": {
4
- "acc": 0.30077120822622105,
5
- "acc_stderr": 0.013430077114209907,
6
- "acc_norm": 0.32390745501285345,
7
- "acc_norm_stderr": 0.013704533924425027
8
- }
9
- },
10
- "versions": {
11
- "arc_ro": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_ru-bloom-7b1.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_ru": {
4
- "acc": 0.21043627031650983,
5
- "acc_stderr": 0.01192703439080346,
6
- "acc_norm": 0.2754491017964072,
7
- "acc_norm_stderr": 0.01307174925264165
8
- }
9
- },
10
- "versions": {
11
- "arc_ru": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=bigscience/bloom-7b1",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_ru-llama-7B.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_ru": {
4
- "acc": 0.2934131736526946,
5
- "acc_stderr": 0.013322973103306575,
6
- "acc_norm": 0.32078699743370404,
7
- "acc_norm_stderr": 0.013658089444975752
8
- }
9
- },
10
- "versions": {
11
- "arc_ru": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_sk-bloom-7b1.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_sk": {
4
- "acc": 0.20359281437125748,
5
- "acc_stderr": 0.011782227020010716,
6
- "acc_norm": 0.24893071000855432,
7
- "acc_norm_stderr": 0.012651960282598879
8
- }
9
- },
10
- "versions": {
11
- "arc_sk": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=bigscience/bloom-7b1",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_sk-llama-7B.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_sk": {
4
- "acc": 0.23609923011120615,
5
- "acc_stderr": 0.012426371635795894,
6
- "acc_norm": 0.28999144568006846,
7
- "acc_norm_stderr": 0.013277091943380979
8
- }
9
- },
10
- "versions": {
11
- "arc_sk": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_sr-bloom-7b1.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_sr": {
4
- "acc": 0.2172797262617622,
5
- "acc_stderr": 0.012066782166932079,
6
- "acc_norm": 0.25149700598802394,
7
- "acc_norm_stderr": 0.01269526466186626
8
- }
9
- },
10
- "versions": {
11
- "arc_sr": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=bigscience/bloom-7b1",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/arc/arc_sr-llama-7B.json DELETED
@@ -1,23 +0,0 @@
1
- {
2
- "results": {
3
- "arc_sr": {
4
- "acc": 0.25748502994011974,
5
- "acc_stderr": 0.012794024494042348,
6
- "acc_norm": 0.30795551753635586,
7
- "acc_norm_stderr": 0.013507954174822524
8
- }
9
- },
10
- "versions": {
11
- "arc_sr": 0
12
- },
13
- "config": {
14
- "model": "hf-auto",
15
- "model_args": "pretrained=/sensei-fs/users/daclai/uoChatGPT/llama-7B",
16
- "batch_size": 1,
17
- "device": "cuda",
18
- "no_cache": false,
19
- "limit": null,
20
- "bootstrap_iters": 100000,
21
- "description_dict": {}
22
- }
23
- }