mychen76 committed
Commit c640d09
1 Parent(s): cbbeb1c

attach result

Files changed (1)
  1. README.md +388 -1
README.md CHANGED
@@ -43,4 +43,391 @@ parameters:
  int8_mask: true
  dtype: bfloat16

- ```
+ ```
+ ## Evaluation
+ https://huggingface.co/datasets/open-llm-leaderboard/details_mychen76__mistral-7b-merged-dare_6x7
+
+ ## Result
+ ```
+ {
+   "all": {
+     "acc": 0.6563139414530638,
+     "acc_stderr": 0.031967569574421976,
+     "acc_norm": 0.6562534942043537,
+     "acc_norm_stderr": 0.03262528877791407,
+     "mc1": 0.5067319461444308,
+     "mc1_stderr": 0.017501914492655396,
+     "mc2": 0.6698288226697681,
+     "mc2_stderr": 0.015121056875692264
+   },
+   "harness|arc:challenge|25": {
+     "acc": 0.6689419795221843,
+     "acc_stderr": 0.013752062419817837,
+     "acc_norm": 0.6962457337883959,
+     "acc_norm_stderr": 0.013438909184778768
+   },
+   "harness|hellaswag|10": {
+     "acc": 0.6946823341963753,
+     "acc_stderr": 0.004596006250433551,
+     "acc_norm": 0.870444134634535,
+     "acc_norm_stderr": 0.003351278403392407
+   },
+   "harness|hendrycksTest-abstract_algebra|5": {
+     "acc": 0.32,
+     "acc_stderr": 0.046882617226215034,
+     "acc_norm": 0.32,
+     "acc_norm_stderr": 0.046882617226215034
+   },
+   "harness|hendrycksTest-anatomy|5": {
+     "acc": 0.6370370370370371,
+     "acc_stderr": 0.041539484047423976,
+     "acc_norm": 0.6370370370370371,
+     "acc_norm_stderr": 0.041539484047423976
+   },
+   "harness|hendrycksTest-astronomy|5": {
+     "acc": 0.6907894736842105,
+     "acc_stderr": 0.037610708698674805,
+     "acc_norm": 0.6907894736842105,
+     "acc_norm_stderr": 0.037610708698674805
+   },
+   "harness|hendrycksTest-business_ethics|5": {
+     "acc": 0.64,
+     "acc_stderr": 0.04824181513244218,
+     "acc_norm": 0.64,
+     "acc_norm_stderr": 0.04824181513244218
+   },
+   "harness|hendrycksTest-clinical_knowledge|5": {
+     "acc": 0.7018867924528301,
+     "acc_stderr": 0.028152837942493864,
+     "acc_norm": 0.7018867924528301,
+     "acc_norm_stderr": 0.028152837942493864
+   },
+   "harness|hendrycksTest-college_biology|5": {
+     "acc": 0.7638888888888888,
+     "acc_stderr": 0.03551446610810826,
+     "acc_norm": 0.7638888888888888,
+     "acc_norm_stderr": 0.03551446610810826
+   },
+   "harness|hendrycksTest-college_chemistry|5": {
+     "acc": 0.47,
+     "acc_stderr": 0.050161355804659205,
+     "acc_norm": 0.47,
+     "acc_norm_stderr": 0.050161355804659205
+   },
+   "harness|hendrycksTest-college_computer_science|5": {
+     "acc": 0.54,
+     "acc_stderr": 0.05009082659620333,
+     "acc_norm": 0.54,
+     "acc_norm_stderr": 0.05009082659620333
+   },
+   "harness|hendrycksTest-college_mathematics|5": {
+     "acc": 0.32,
+     "acc_stderr": 0.04688261722621504,
+     "acc_norm": 0.32,
+     "acc_norm_stderr": 0.04688261722621504
+   },
+   "harness|hendrycksTest-college_medicine|5": {
+     "acc": 0.6589595375722543,
+     "acc_stderr": 0.036146654241808254,
+     "acc_norm": 0.6589595375722543,
+     "acc_norm_stderr": 0.036146654241808254
+   },
+   "harness|hendrycksTest-college_physics|5": {
+     "acc": 0.4019607843137255,
+     "acc_stderr": 0.048786087144669955,
+     "acc_norm": 0.4019607843137255,
+     "acc_norm_stderr": 0.048786087144669955
+   },
+   "harness|hendrycksTest-computer_security|5": {
+     "acc": 0.77,
+     "acc_stderr": 0.04229525846816507,
+     "acc_norm": 0.77,
+     "acc_norm_stderr": 0.04229525846816507
+   },
+   "harness|hendrycksTest-conceptual_physics|5": {
+     "acc": 0.5829787234042553,
+     "acc_stderr": 0.03223276266711712,
+     "acc_norm": 0.5829787234042553,
+     "acc_norm_stderr": 0.03223276266711712
+   },
+   "harness|hendrycksTest-econometrics|5": {
+     "acc": 0.45614035087719296,
+     "acc_stderr": 0.046854730419077895,
+     "acc_norm": 0.45614035087719296,
+     "acc_norm_stderr": 0.046854730419077895
+   },
+   "harness|hendrycksTest-electrical_engineering|5": {
+     "acc": 0.5724137931034483,
+     "acc_stderr": 0.04122737111370333,
+     "acc_norm": 0.5724137931034483,
+     "acc_norm_stderr": 0.04122737111370333
+   },
+   "harness|hendrycksTest-elementary_mathematics|5": {
+     "acc": 0.42063492063492064,
+     "acc_stderr": 0.025424835086924,
+     "acc_norm": 0.42063492063492064,
+     "acc_norm_stderr": 0.025424835086924
+   },
+   "harness|hendrycksTest-formal_logic|5": {
+     "acc": 0.49206349206349204,
+     "acc_stderr": 0.044715725362943486,
+     "acc_norm": 0.49206349206349204,
+     "acc_norm_stderr": 0.044715725362943486
+   },
+   "harness|hendrycksTest-global_facts|5": {
+     "acc": 0.39,
+     "acc_stderr": 0.04902071300001974,
+     "acc_norm": 0.39,
+     "acc_norm_stderr": 0.04902071300001974
+   },
+   "harness|hendrycksTest-high_school_biology|5": {
+     "acc": 0.7838709677419354,
+     "acc_stderr": 0.02341529343356853,
+     "acc_norm": 0.7838709677419354,
+     "acc_norm_stderr": 0.02341529343356853
+   },
+   "harness|hendrycksTest-high_school_chemistry|5": {
+     "acc": 0.5024630541871922,
+     "acc_stderr": 0.03517945038691063,
+     "acc_norm": 0.5024630541871922,
+     "acc_norm_stderr": 0.03517945038691063
+   },
+   "harness|hendrycksTest-high_school_computer_science|5": {
+     "acc": 0.72,
+     "acc_stderr": 0.04512608598542127,
+     "acc_norm": 0.72,
+     "acc_norm_stderr": 0.04512608598542127
+   },
+   "harness|hendrycksTest-high_school_european_history|5": {
+     "acc": 0.7696969696969697,
+     "acc_stderr": 0.0328766675860349,
+     "acc_norm": 0.7696969696969697,
+     "acc_norm_stderr": 0.0328766675860349
+   },
+   "harness|hendrycksTest-high_school_geography|5": {
+     "acc": 0.7929292929292929,
+     "acc_stderr": 0.028869778460267042,
+     "acc_norm": 0.7929292929292929,
+     "acc_norm_stderr": 0.028869778460267042
+   },
+   "harness|hendrycksTest-high_school_government_and_politics|5": {
+     "acc": 0.8963730569948186,
+     "acc_stderr": 0.02199531196364424,
+     "acc_norm": 0.8963730569948186,
+     "acc_norm_stderr": 0.02199531196364424
+   },
+   "harness|hendrycksTest-high_school_macroeconomics|5": {
+     "acc": 0.6641025641025641,
+     "acc_stderr": 0.023946724741563976,
+     "acc_norm": 0.6641025641025641,
+     "acc_norm_stderr": 0.023946724741563976
+   },
+   "harness|hendrycksTest-high_school_mathematics|5": {
+     "acc": 0.35185185185185186,
+     "acc_stderr": 0.02911661760608301,
+     "acc_norm": 0.35185185185185186,
+     "acc_norm_stderr": 0.02911661760608301
+   },
+   "harness|hendrycksTest-high_school_microeconomics|5": {
+     "acc": 0.7016806722689075,
+     "acc_stderr": 0.02971914287634286,
+     "acc_norm": 0.7016806722689075,
+     "acc_norm_stderr": 0.02971914287634286
+   },
+   "harness|hendrycksTest-high_school_physics|5": {
+     "acc": 0.37748344370860926,
+     "acc_stderr": 0.03958027231121569,
+     "acc_norm": 0.37748344370860926,
+     "acc_norm_stderr": 0.03958027231121569
+   },
+   "harness|hendrycksTest-high_school_psychology|5": {
+     "acc": 0.8587155963302753,
+     "acc_stderr": 0.014933868987028075,
+     "acc_norm": 0.8587155963302753,
+     "acc_norm_stderr": 0.014933868987028075
+   },
+   "harness|hendrycksTest-high_school_statistics|5": {
+     "acc": 0.5324074074074074,
+     "acc_stderr": 0.03402801581358966,
+     "acc_norm": 0.5324074074074074,
+     "acc_norm_stderr": 0.03402801581358966
+   },
+   "harness|hendrycksTest-high_school_us_history|5": {
+     "acc": 0.8480392156862745,
+     "acc_stderr": 0.025195658428931792,
+     "acc_norm": 0.8480392156862745,
+     "acc_norm_stderr": 0.025195658428931792
+   },
+   "harness|hendrycksTest-high_school_world_history|5": {
+     "acc": 0.8016877637130801,
+     "acc_stderr": 0.02595502084162113,
+     "acc_norm": 0.8016877637130801,
+     "acc_norm_stderr": 0.02595502084162113
+   },
+   "harness|hendrycksTest-human_aging|5": {
+     "acc": 0.6905829596412556,
+     "acc_stderr": 0.03102441174057221,
+     "acc_norm": 0.6905829596412556,
+     "acc_norm_stderr": 0.03102441174057221
+   },
+   "harness|hendrycksTest-human_sexuality|5": {
+     "acc": 0.8091603053435115,
+     "acc_stderr": 0.03446513350752598,
+     "acc_norm": 0.8091603053435115,
+     "acc_norm_stderr": 0.03446513350752598
+   },
+   "harness|hendrycksTest-international_law|5": {
+     "acc": 0.7933884297520661,
+     "acc_stderr": 0.03695980128098824,
+     "acc_norm": 0.7933884297520661,
+     "acc_norm_stderr": 0.03695980128098824
+   },
+   "harness|hendrycksTest-jurisprudence|5": {
+     "acc": 0.8055555555555556,
+     "acc_stderr": 0.038260763248848646,
+     "acc_norm": 0.8055555555555556,
+     "acc_norm_stderr": 0.038260763248848646
+   },
+   "harness|hendrycksTest-logical_fallacies|5": {
+     "acc": 0.7423312883435583,
+     "acc_stderr": 0.03436150827846917,
+     "acc_norm": 0.7423312883435583,
+     "acc_norm_stderr": 0.03436150827846917
+   },
+   "harness|hendrycksTest-machine_learning|5": {
+     "acc": 0.45535714285714285,
+     "acc_stderr": 0.047268355537191,
+     "acc_norm": 0.45535714285714285,
+     "acc_norm_stderr": 0.047268355537191
+   },
+   "harness|hendrycksTest-management|5": {
+     "acc": 0.8058252427184466,
+     "acc_stderr": 0.039166677628225836,
+     "acc_norm": 0.8058252427184466,
+     "acc_norm_stderr": 0.039166677628225836
+   },
+   "harness|hendrycksTest-marketing|5": {
+     "acc": 0.8717948717948718,
+     "acc_stderr": 0.021901905115073325,
+     "acc_norm": 0.8717948717948718,
+     "acc_norm_stderr": 0.021901905115073325
+   },
+   "harness|hendrycksTest-medical_genetics|5": {
+     "acc": 0.76,
+     "acc_stderr": 0.042923469599092816,
+     "acc_norm": 0.76,
+     "acc_norm_stderr": 0.042923469599092816
+   },
+   "harness|hendrycksTest-miscellaneous|5": {
+     "acc": 0.8326947637292464,
+     "acc_stderr": 0.013347327202920332,
+     "acc_norm": 0.8326947637292464,
+     "acc_norm_stderr": 0.013347327202920332
+   },
+   "harness|hendrycksTest-moral_disputes|5": {
+     "acc": 0.7283236994219653,
+     "acc_stderr": 0.023948512905468365,
+     "acc_norm": 0.7283236994219653,
+     "acc_norm_stderr": 0.023948512905468365
+   },
+   "harness|hendrycksTest-moral_scenarios|5": {
+     "acc": 0.4770949720670391,
+     "acc_stderr": 0.016704945740326188,
+     "acc_norm": 0.4770949720670391,
+     "acc_norm_stderr": 0.016704945740326188
+   },
+   "harness|hendrycksTest-nutrition|5": {
+     "acc": 0.7450980392156863,
+     "acc_stderr": 0.02495418432487991,
+     "acc_norm": 0.7450980392156863,
+     "acc_norm_stderr": 0.02495418432487991
+   },
+   "harness|hendrycksTest-philosophy|5": {
+     "acc": 0.7106109324758842,
+     "acc_stderr": 0.025755865922632952,
+     "acc_norm": 0.7106109324758842,
+     "acc_norm_stderr": 0.025755865922632952
+   },
+   "harness|hendrycksTest-prehistory|5": {
+     "acc": 0.75,
+     "acc_stderr": 0.02409347123262133,
+     "acc_norm": 0.75,
+     "acc_norm_stderr": 0.02409347123262133
+   },
+   "harness|hendrycksTest-professional_accounting|5": {
+     "acc": 0.475177304964539,
+     "acc_stderr": 0.02979071924382972,
+     "acc_norm": 0.475177304964539,
+     "acc_norm_stderr": 0.02979071924382972
+   },
+   "harness|hendrycksTest-professional_law|5": {
+     "acc": 0.46284224250325945,
+     "acc_stderr": 0.012734923579532069,
+     "acc_norm": 0.46284224250325945,
+     "acc_norm_stderr": 0.012734923579532069
+   },
+   "harness|hendrycksTest-professional_medicine|5": {
+     "acc": 0.6985294117647058,
+     "acc_stderr": 0.027875982114273168,
+     "acc_norm": 0.6985294117647058,
+     "acc_norm_stderr": 0.027875982114273168
+   },
+   "harness|hendrycksTest-professional_psychology|5": {
+     "acc": 0.6666666666666666,
+     "acc_stderr": 0.0190709855896875,
+     "acc_norm": 0.6666666666666666,
+     "acc_norm_stderr": 0.0190709855896875
+   },
+   "harness|hendrycksTest-public_relations|5": {
+     "acc": 0.6545454545454545,
+     "acc_stderr": 0.04554619617541054,
+     "acc_norm": 0.6545454545454545,
+     "acc_norm_stderr": 0.04554619617541054
+   },
+   "harness|hendrycksTest-security_studies|5": {
+     "acc": 0.726530612244898,
+     "acc_stderr": 0.02853556033712844,
+     "acc_norm": 0.726530612244898,
+     "acc_norm_stderr": 0.02853556033712844
+   },
+   "harness|hendrycksTest-sociology|5": {
+     "acc": 0.845771144278607,
+     "acc_stderr": 0.025538433368578337,
+     "acc_norm": 0.845771144278607,
+     "acc_norm_stderr": 0.025538433368578337
+   },
+   "harness|hendrycksTest-us_foreign_policy|5": {
+     "acc": 0.86,
+     "acc_stderr": 0.0348735088019777,
+     "acc_norm": 0.86,
+     "acc_norm_stderr": 0.0348735088019777
+   },
+   "harness|hendrycksTest-virology|5": {
+     "acc": 0.5180722891566265,
+     "acc_stderr": 0.03889951252827216,
+     "acc_norm": 0.5180722891566265,
+     "acc_norm_stderr": 0.03889951252827216
+   },
+   "harness|hendrycksTest-world_religions|5": {
+     "acc": 0.8362573099415205,
+     "acc_stderr": 0.028380919596145866,
+     "acc_norm": 0.8362573099415205,
+     "acc_norm_stderr": 0.028380919596145866
+   },
+   "harness|truthfulqa:mc|0": {
+     "mc1": 0.5067319461444308,
+     "mc1_stderr": 0.017501914492655396,
+     "mc2": 0.6698288226697681,
+     "mc2_stderr": 0.015121056875692264
+   },
+   "harness|winogrande|5": {
+     "acc": 0.8058405682715075,
+     "acc_stderr": 0.01111698339239267
+   },
+   "harness|gsm8k|5": {
+     "acc": 0.7134192570128886,
+     "acc_stderr": 0.0124548416683377
+   }
+ }
+
+ ```
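
A minimal sketch for working with the attached result blob programmatically rather than reading it by eye: it assumes the JSON above has been saved locally as `results.json` (a hypothetical file name, not part of this commit) and simply averages `acc_norm` across the `harness|hendrycksTest-*` (MMLU) sub-tasks.

```python
# Illustrative sketch only; "results.json" is an assumed local copy of the
# result JSON attached in this commit, not a file shipped with the model.
import json
from statistics import mean

with open("results.json") as f:
    results = json.load(f)

# lm-eval-harness keys each task as "harness|<task>|<num_fewshot>".
mmlu_scores = {
    task: metrics["acc_norm"]
    for task, metrics in results.items()
    if task.startswith("harness|hendrycksTest-")
}

print(f"MMLU sub-tasks found: {len(mmlu_scores)}")
print(f"Mean acc_norm across MMLU sub-tasks: {mean(mmlu_scores.values()):.4f}")
```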