update eq-bench
- app.py +5 -0
- benchmark_results.csv +11 -0
- metadata.json +7 -1
app.py
CHANGED
@@ -64,6 +64,8 @@ with demo:
     def get_params(model_name):
         if model_name in metadata:
             return metadata[model_name]
+        else:
+            print(model_name)
         return numpy.nan
 
 
@@ -77,6 +79,9 @@ with demo:
     # change value of column to nan
     leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].replace('FAILED', numpy.nan)
 
+    #scale Benchmark Score by Num Questions Parseable*171
+    leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].astype(float) * ((leaderboard_df["Num Questions Parseable"].astype(float) / 171))
+
     # set datatype of column
     leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].astype(float)
     leaderboard_df["Num Questions Parseable"] = leaderboard_df["Num Questions Parseable"].astype(float)
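The added lines scale each leaderboard score by the fraction of the benchmark's 171 questions that were parseable. A minimal sketch of that step on a toy frame, not the Space's full app.py: the column names and the 171 total come from the diff, and the sample rows are invented.

import numpy
import pandas as pd

# Toy stand-in for the leaderboard table; the real data comes from benchmark_results.csv.
leaderboard_df = pd.DataFrame({
    "Benchmark Score": ["69.05", "66.27", "FAILED"],
    "Num Questions Parseable": [171.0, 155.0, 50.0],
})

# existing step: turn FAILED scores into NaN
leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].replace("FAILED", numpy.nan)

# new step: scale Benchmark Score by Num Questions Parseable / 171
leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].astype(float) * (
    leaderboard_df["Num Questions Parseable"].astype(float) / 171
)

print(leaderboard_df["Benchmark Score"].tolist())  # [69.05, ~60.07, nan]

So a run with 155 of 171 answers parseable has its raw 66.27 reduced to about 60.07, while fully parseable runs keep their original score.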
benchmark_results.csv
CHANGED
@@ -34,6 +34,7 @@ openchat-gemma,2024-06-19 10:19:44,,openchat/openchat-3.5-0106-gemma,,,59.93,eq-
 Nous-Hermes-2-SOLAR-10.7B,2024-06-19 10:27:36,,NousResearch/Nous-Hermes-2-SOLAR-10.7B,,,48.22,eq-bench_v2_pl,169.0,1,transformers, ,,
 SOLAR-10.7B-Instruct-v1.0,2024-06-19 10:43:47,,upstage/SOLAR-10.7B-Instruct-v1.0,,,57.57,eq-bench_v2_pl,164.0,1,transformers, ,,
 Qwen2-7B-Instruct,2024-06-19 10:46:52,,Qwen/Qwen2-7B-Instruct,,,53.08,eq-bench_v2_pl,171.0,1,transformers, ,,
+models/gwint2,2024-06-19 11:21:15,,speakleash/Bielik-11B-v2.0-Instruct,,,68.24,eq-bench_v2_pl,171.0,1,transformers, ,,
 Azurro/APT3-275M-Base,2024-06-19 11:36:43,,Azurro/APT3-275M-Base,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,0.0 questions were parseable (min is 83%)
 Qwen/Qwen2-0.5B,2024-06-19 11:47:44,,Qwen/Qwen2-0.5B,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,18.0 questions were parseable (min is 83%)
 Qwen/Qwen2-0.5B-Instruct,2024-06-19 11:51:21,,Qwen/Qwen2-0.5B-Instruct,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,125.0 questions were parseable (min is 83%)
@@ -141,3 +142,13 @@ mistralai/Mixtral-8x22B-v0.1,2024-06-21 20:20:37,,mistralai/Mixtral-8x22B-v0.1,,
 mistralai/Mixtral-8x22B-Instruct-v0.1,2024-06-26 23:40:01,,mistralai/Mixtral-8x22B-Instruct-v0.1,,,67.63,eq-bench_v2_pl,171.0,1,transformers, ,,
 mistralai/Mixtral-8x22B-v0.1,2024-06-27 01:17:13,,mistralai/Mixtral-8x22B-v0.1,,,FAILED,eq-bench_pl,FAILED,1,transformers, ,,50.0 questions were parseable (min is 83%)
 alpindale/WizardLM-2-8x22B,2024-06-27 01:50:42,,alpindale/WizardLM-2-8x22B,,,69.56,eq-bench_v2_pl,171.0,1,transformers, ,,
+Bielik_v2.2b,2024-08-24 09:54:33,,speakleash/Bielik-11B-v2.2-Instruct,,,69.05,eq-bench_v2_pl,171.0,1,transformers, ,,
+Bielik_v2.1,2024-08-24 10:07:46,,speakleash/Bielik-11B-v2.1-Instruct,,,66.27,eq-bench_v2_pl,155.0,1,transformers, ,,
+meta-llama/Meta-Llama-3.1-70B-Instruct,2024-08-24 21:24:39,,meta-llama/Meta-Llama-3.1-70B-Instruct,,,FAILED,eq-bench,FAILED,1,transformers, ,,`rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {'factor': 8.0, 'low_freq_factor': 1.0, 'high_freq_factor': 4.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}
+mistralai/Mistral-Large-Instruct-2407,2024-08-24 21:51:53,,mistralai/Mistral-Large-Instruct-2407,,,78.07,eq-bench_v2_pl,171.0,1,transformers, ,,
+meta-llama/Meta-Llama-3.1-70B-Instruct,2024-08-24 22:23:40,,meta-llama/Meta-Llama-3.1-70B-Instruct,,,72.53,eq-bench_v2_pl,171.0,1,transformers, ,,
+meta-llama/Meta-Llama-3.1-405B-Instruct-FP8,2024-08-25 20:59:04,openai_api,meta-llama/Meta-Llama-3.1-405B-Instruct-FP8,,,77.23,eq-bench_v2_pl,171.0,1,openai,,,
+gpt-3.5-turbo,2024-08-25 21:14:25,openai_api,gpt-3.5-turbo,,,57.7,eq-bench_v2_pl,171.0,1,openai,,,
+gpt-4o-mini-2024-07-18,2024-08-25 21:17:34,openai_api,gpt-4o-mini-2024-07-18,,,71.15,eq-bench_v2_pl,171.0,1,openai,,,
+gpt-4o-2024-08-06,2024-08-25 21:24:35,openai_api,gpt-4o-2024-08-06,,,75.15,eq-bench_v2_pl,171.0,1,openai,,,
+gpt-4-turbo-2024-04-09,2024-08-25 21:31:42,openai_api,gpt-4-turbo-2024-04-09,,,77.77,eq-bench_v2_pl,164.0,1,openai,,,
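The new CSV rows follow the existing layout: failed runs carry FAILED in the score and parseable-count fields plus an error message, and the messages state that at least 83% of the 171 questions must be parseable (0.83 * 171 = 141.93, so roughly 142 answers). A rough sketch of splitting failed from scored runs; it assumes the file has a header row containing the column name app.py references, which the real header may not match exactly.

import pandas as pd

# Assumption: benchmark_results.csv has a header row that includes
# a "Benchmark Score" column, as referenced by app.py.
results = pd.read_csv("benchmark_results.csv")

failed = results[results["Benchmark Score"] == "FAILED"]
scored = results[results["Benchmark Score"] != "FAILED"]
print(f"{len(scored)} scored runs, {len(failed)} failed runs")

# Threshold implied by the failure messages: 83% of the 171 questions.
print(0.83 * 171)  # 141.93 -> about 142 parseable answers needed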
|
metadata.json
CHANGED
@@ -312,5 +312,11 @@
     "microsoft/Phi-3-small-8k-instruct": 7.4,
     "ssmits/Falcon2-5.5B-Polish": 5.5,
     "alpindale/WizardLM-2-8x22B,max_length=4096": 141,
-    "dreamgen/WizardLM-2-7B": 7
+    "dreamgen/WizardLM-2-7B": 7,
+    "mistralai/Mistral-Large-Instruct-2407": 123,
+    "meta-llama/Meta-Llama-3.1-70B-Instruct": 70,
+    "meta-llama/Meta-Llama-3.1-405B-Instruct-FP8": 405,
+    "speakleash/Bielik-11B-v2.0-Instruct": 11,
+    "speakleash/Bielik-11B-v2.2-Instruct": 11,
+    "speakleash/Bielik-11B-v2.1-Instruct": 11
 }
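metadata.json appears to map a model id to its parameter count in billions, which is what get_params in app.py looks up. A minimal sketch of that lookup against the updated file; the fallback print mirrors the app.py change, and the relative file path is an assumption about the Space layout.

import json
import numpy

# Assumption: metadata.json sits next to app.py in the Space root.
with open("metadata.json") as f:
    metadata = json.load(f)

def get_params(model_name):
    if model_name in metadata:
        return metadata[model_name]
    else:
        print(model_name)  # log unknown models, as in the updated app.py
    return numpy.nan

print(get_params("speakleash/Bielik-11B-v2.2-Instruct"))   # 11
print(get_params("mistralai/Mistral-Large-Instruct-2407"))  # 123
print(get_params("some/unknown-model"))                     # prints the name, returns nan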