Spaces:
Running
Running
Yotam Perlitz
commited on
Commit
β’
0f8e886
1
Parent(s):
ecb1e20
build app
Browse filesSigned-off-by: Yotam Perlitz <yotam.perlitz@ibm.com>
- app.py +57 -0
- assets/combined_20240704.csv +0 -0
- assets/combined_holistic.csv +825 -0
- assets/combined_holistic_20240708.csv +938 -0
- assets/livebench.csv +365 -0
app.py
CHANGED
@@ -3,6 +3,63 @@ import pandas as pd
|
|
3 |
|
4 |
st.title("βββ ββ β β β β β βποΈββοΈ benchbench-Leaderboard ποΈββοΈ")
|
5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
# df = pd.read_csv("BAT_w_arena_10_random.csv")
|
7 |
# df = (
|
8 |
# (
|
|
|
3 |
|
4 |
st.title("βββ ββ β β β β β βποΈββοΈ benchbench-Leaderboard ποΈββοΈ")
|
5 |
|
6 |
+
import pandas as pd
|
7 |
+
from bat import Tester, Config, Benchmark, Reporter
|
8 |
+
from bat.utils import get_holistic_benchmark
|
9 |
+
|
10 |
+
|
11 |
+
cfg = Config(
|
12 |
+
exp_to_run="example",
|
13 |
+
n_models_taken_list=[0],
|
14 |
+
model_select_strategy_list=["random"],
|
15 |
+
n_exps=10,
|
16 |
+
# reference_data_path="data/combined_holistic.csv",
|
17 |
+
)
|
18 |
+
|
19 |
+
|
20 |
+
newbench_name = "livebench"
|
21 |
+
new_bench_agg_name = f"{newbench_name}_mwr"
|
22 |
+
|
23 |
+
tester = Tester(cfg=cfg)
|
24 |
+
|
25 |
+
# models_for_benchmark_scoring = tester.fetch_reference_models_names(
|
26 |
+
# reference_benchmark=get_holistic_benchmark(), n_models=20
|
27 |
+
# )
|
28 |
+
|
29 |
+
newbench = Benchmark(
|
30 |
+
pd.read_csv(f"assets/{newbench_name}.csv"),
|
31 |
+
data_source=newbench_name,
|
32 |
+
)
|
33 |
+
|
34 |
+
# newbench.add_aggragete(new_col_name=new_bench_agg_name)
|
35 |
+
# newbench_agreements = tester.all_vs_all_agreement_testing(newbench)
|
36 |
+
|
37 |
+
reporter = Reporter()
|
38 |
+
# reporter.draw_agreements(
|
39 |
+
# newbench_agreements, ref_sources=[newbench_name], scenario_sources=[newbench_name]
|
40 |
+
# )
|
41 |
+
|
42 |
+
holistic = get_holistic_benchmark()
|
43 |
+
holistic.add_aggragete(new_col_name="aggregate", agg_source_name="holistic")
|
44 |
+
|
45 |
+
allbench = newbench.extend(holistic)
|
46 |
+
allbench.clear_repeated_scenarios(source_to_keep=newbench_name)
|
47 |
+
|
48 |
+
|
49 |
+
@st.cache_data
|
50 |
+
def run_load():
|
51 |
+
return tester.all_vs_all_agreement_testing(allbench)
|
52 |
+
|
53 |
+
|
54 |
+
all_agreements = run_load()
|
55 |
+
|
56 |
+
observed_scenario = "arena_elo" # "livebench_lb"
|
57 |
+
blacklist_sources = [] # "livebench"
|
58 |
+
|
59 |
+
z_score = reporter.get_z_score(all_agreements, observed_scenario, blacklist_sources)
|
60 |
+
|
61 |
+
st.write(f"zscore of {observed_scenario}: {z_score}")
|
62 |
+
|
63 |
# df = pd.read_csv("BAT_w_arena_10_random.csv")
|
64 |
# df = (
|
65 |
# (
|
assets/combined_20240704.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
assets/combined_holistic.csv
ADDED
@@ -0,0 +1,825 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
,model,score,scenario,source,aggragated_from
|
2 |
+
0,gpt-4-turbo-2024-04-09,82.6,arena-hard,arena_hard_2404,[]
|
3 |
+
1,gpt-4-0125-preview,78.0,arena-hard,arena_hard_2404,[]
|
4 |
+
2,gemini-1.5-pro-api-preview,72.0,arena-hard,arena_hard_2404,[]
|
5 |
+
3,yi-large,63.7,arena-hard,arena_hard_2404,[]
|
6 |
+
4,claude-3-opus-20240229,60.4,arena-hard,arena_hard_2404,[]
|
7 |
+
5,glm-4,55.7,arena-hard,arena_hard_2404,[]
|
8 |
+
6,gpt-4-0314,50.0,arena-hard,arena_hard_2404,[]
|
9 |
+
7,gemini-1.5-flash-api-preview,49.6,arena-hard,arena_hard_2404,[]
|
10 |
+
8,claude-3-sonnet-20240229,46.8,arena-hard,arena_hard_2404,[]
|
11 |
+
9,claude-3-haiku-20240307,41.5,arena-hard,arena_hard_2404,[]
|
12 |
+
10,llama-3-70b-chat-hf,41.1,arena-hard,arena_hard_2404,[]
|
13 |
+
11,gpt-4-0613,37.9,arena-hard,arena_hard_2404,[]
|
14 |
+
12,mistral-large-2402,37.7,arena-hard,arena_hard_2404,[]
|
15 |
+
13,mixtral-8x22b-instruct-v0.1,36.4,arena-hard,arena_hard_2404,[]
|
16 |
+
14,qwen1.5-72b-chat,36.1,arena-hard,arena_hard_2404,[]
|
17 |
+
15,command-r-plus,33.1,arena-hard,arena_hard_2404,[]
|
18 |
+
16,mistral-medium,31.9,arena-hard,arena_hard_2404,[]
|
19 |
+
17,mistral-next,27.4,arena-hard,arena_hard_2404,[]
|
20 |
+
18,gpt-3.5-turbo-0613,24.8,arena-hard,arena_hard_2404,[]
|
21 |
+
19,claude-2.0,24.0,arena-hard,arena_hard_2404,[]
|
22 |
+
20,dbrx-instructruct,23.9,arena-hard,arena_hard_2404,[]
|
23 |
+
21,mixtral-8x7b-instruct-v0.1,23.4,arena-hard,arena_hard_2404,[]
|
24 |
+
22,gpt-3.5-turbo-0125,23.3,arena-hard,arena_hard_2404,[]
|
25 |
+
23,yi-34b-chat,23.1,arena-hard,arena_hard_2404,[]
|
26 |
+
24,starling-lm-7b-beta,23.0,arena-hard,arena_hard_2404,[]
|
27 |
+
25,claude-2.1,22.8,arena-hard,arena_hard_2404,[]
|
28 |
+
26,snorkel-mistral-pairrm-dpo,20.7,arena-hard,arena_hard_2404,[]
|
29 |
+
27,llama-3-8b-chat-hf,20.6,arena-hard,arena_hard_2404,[]
|
30 |
+
28,gpt-3.5-turbo-1106,18.9,arena-hard,arena_hard_2404,[]
|
31 |
+
29,gpt-3.5-turbo-0301,18.1,arena-hard,arena_hard_2404,[]
|
32 |
+
30,gemini-1.0-pro,17.8,arena-hard,arena_hard_2404,[]
|
33 |
+
31,snowflake-arctic-instruct,17.6,arena-hard,arena_hard_2404,[]
|
34 |
+
32,command-r,17.0,arena-hard,arena_hard_2404,[]
|
35 |
+
33,phi-3-mini-128k-instruct,15.4,arena-hard,arena_hard_2404,[]
|
36 |
+
34,tulu-2-dpo-70b,15.0,arena-hard,arena_hard_2404,[]
|
37 |
+
35,starling-lm-7b-alpha,12.8,arena-hard,arena_hard_2404,[]
|
38 |
+
36,mistral-7b-instruct,12.6,arena-hard,arena_hard_2404,[]
|
39 |
+
37,gemma-1.1-7b-it,12.1,arena-hard,arena_hard_2404,[]
|
40 |
+
38,llama-2-70b-chat-hf,11.6,arena-hard,arena_hard_2404,[]
|
41 |
+
39,vicuna-33b-v1.3,8.6,arena-hard,arena_hard_2404,[]
|
42 |
+
40,gemma-7b-it,7.5,arena-hard,arena_hard_2404,[]
|
43 |
+
41,llama-2-7b-chat-hf,4.6,arena-hard,arena_hard_2404,[]
|
44 |
+
42,gemma-1.1-2b-it,3.4,arena-hard,arena_hard_2404,[]
|
45 |
+
43,gemma-2b-it,3.0,arena-hard,arena_hard_2404,[]
|
46 |
+
0,gpt-4o-2024-05-13,64.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
47 |
+
1,claude-3-opus,63.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
48 |
+
2,gpt-4-turbo-2024-04-09,62.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
49 |
+
3,gemini-1.5-pro-api-0409,58.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
50 |
+
4,yi-large-preview,56.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
51 |
+
5,llama-3-70b-instruct,55.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
52 |
+
6,qwen-max-0428,55.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
53 |
+
7,claude-3-sonnet,54.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
54 |
+
8,reka-core-20240415,52.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
55 |
+
9,mammoth2-8x7b-plus,51.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
56 |
+
10,deepseek-v2,51.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
57 |
+
11,command-r-plus,51.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
58 |
+
12,yi-1.5-34b-chat,51.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
59 |
+
13,mistral-large,50.3,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
60 |
+
14,qwen1.5-72b-chat,48.3,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
61 |
+
15,mistral-medium,47.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
62 |
+
16,gemini-1.0-pro,46.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
63 |
+
17,reka-flash-20240226,46.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
64 |
+
18,mistral-small,46.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
65 |
+
19,llama-3-8b-instruct,45.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
66 |
+
20,command-r,45.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
67 |
+
21,qwen1.5-32b-chat,43.3,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
68 |
+
22,gpt-3.5-turbo-0125,43.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
69 |
+
23,claude-3-haiku,42.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
70 |
+
24,yi-34b-chat,42.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
71 |
+
25,mixtral-8x7b-instruct-v0.1,42.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
72 |
+
26,starling-lm-7b-beta,41.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
73 |
+
27,yi-1.5-9b-chat,40.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
74 |
+
28,gemma-1.1-7b-it,39.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
75 |
+
29,vicuna-33b-v1.3,38.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
76 |
+
30,llama-2-70b-chat,38.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
77 |
+
31,map-neo-instruct-v0.1,37.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
78 |
+
32,mistral-7b-instruct-v0.2,36.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
79 |
+
33,qwen1.5-7b-chat,35.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
80 |
+
34,reka-edge-20240208,32.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
81 |
+
35,zephyr-7b-beta,31.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
82 |
+
36,llama-2-7b-chat,30.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
83 |
+
37,yi-6b-chat,30.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
84 |
+
38,qwen1.5-moe-a2.7b-chat,29.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
85 |
+
39,gemma-1.1-2b-it,28.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
86 |
+
40,vicuna-7b-v1.5,27.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
87 |
+
41,olmo-7b-instruct,26.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
88 |
+
42,qwen1.5-4b-chat,24.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
89 |
+
43,jetmoe-8b-chat,24.3,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
90 |
+
44,mpt-7b-chat,23.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
91 |
+
45,llama-3-70b,54.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
92 |
+
46,qwen1.5-72b,41.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
93 |
+
47,yi-34b,47.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
94 |
+
48,qwen1.5-32b,41.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
95 |
+
49,mixtral-8x7b,40.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
96 |
+
50,llama-2-70b,41.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
97 |
+
51,qwen1.5-moe-a2.7b,33.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
98 |
+
52,qwen1.5-7b,33.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
99 |
+
53,llama-3-8b,31.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
100 |
+
54,mistral-7b,27.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
101 |
+
55,gemma-7b,32.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
102 |
+
56,yi-6b,30.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
103 |
+
57,qwen1.5-4b,23.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
104 |
+
58,jetmoe-8b,27.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
105 |
+
59,deepseek-7b,21.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
106 |
+
60,phi-2,21.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
107 |
+
61,deepseekmoe-16b,24.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
108 |
+
62,llama-2-7b,22.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
109 |
+
63,gemma-2b,22.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
110 |
+
64,olmo-7b,21.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
111 |
+
65,mpt-7b,17.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
112 |
+
66,gpt-4o-2024-05-13,87.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
113 |
+
67,claude-3-opus,88.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
114 |
+
68,gpt-4-turbo-2024-04-09,88.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
115 |
+
69,gemini-1.5-pro-api-0409,84.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
116 |
+
70,yi-large-preview,84.4,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
117 |
+
71,llama-3-70b-instruct,84.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
118 |
+
72,qwen-max-0428,86.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
119 |
+
73,claude-3-sonnet,81.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
120 |
+
74,reka-core-20240415,83.3,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
121 |
+
75,mammoth2-8x7b-plus,81.5,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
122 |
+
76,deepseek-v2,83.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
123 |
+
77,command-r-plus,81.5,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
124 |
+
78,yi-1.5-34b-chat,81.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
125 |
+
79,mistral-large,84.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
126 |
+
80,qwen1.5-72b-chat,84.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
127 |
+
81,mistral-medium,81.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
128 |
+
82,gemini-1.0-pro,78.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
129 |
+
83,reka-flash-20240226,79.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
130 |
+
84,mistral-small,81.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
131 |
+
85,llama-3-8b-instruct,75.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
132 |
+
86,command-r,77.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
133 |
+
87,qwen1.5-32b-chat,81.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
134 |
+
88,gpt-3.5-turbo-0125,79.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
135 |
+
89,claude-3-haiku,79.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
136 |
+
90,yi-34b-chat,80.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
137 |
+
91,mixtral-8x7b-instruct-v0.1,76.4,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
138 |
+
92,starling-lm-7b-beta,74.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
139 |
+
93,yi-1.5-9b-chat,74.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
140 |
+
94,gemma-1.1-7b-it,69.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
141 |
+
95,vicuna-33b-v1.3,66.3,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
142 |
+
96,llama-2-70b-chat,74.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
143 |
+
97,map-neo-instruct-v0.1,70.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
144 |
+
98,mistral-7b-instruct-v0.2,70.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
145 |
+
99,qwen1.5-7b-chat,71.4,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
146 |
+
100,reka-edge-20240208,68.5,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
147 |
+
101,zephyr-7b-beta,69.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
148 |
+
102,llama-2-7b-chat,61.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
149 |
+
103,yi-6b-chat,65.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
150 |
+
104,qwen1.5-moe-a2.7b-chat,69.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
151 |
+
105,gemma-1.1-2b-it,51.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
152 |
+
106,vicuna-7b-v1.5,60.3,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
153 |
+
107,olmo-7b-instruct,55.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
154 |
+
108,qwen1.5-4b-chat,57.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
155 |
+
109,jetmoe-8b-chat,51.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
156 |
+
110,mpt-7b-chat,43.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
157 |
+
111,llama-3-70b,82.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
158 |
+
112,qwen1.5-72b,79.5,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
159 |
+
113,yi-34b,78.3,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
160 |
+
114,qwen1.5-32b,77.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
161 |
+
115,mixtral-8x7b,74.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
162 |
+
116,llama-2-70b,73.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
163 |
+
117,qwen1.5-moe-a2.7b,70.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
164 |
+
118,qwen1.5-7b,68.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
165 |
+
119,llama-3-8b,65.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
166 |
+
120,mistral-7b,64.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
167 |
+
121,gemma-7b,64.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
168 |
+
122,yi-6b,63.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
169 |
+
123,qwen1.5-4b,58.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
170 |
+
124,jetmoe-8b,57.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
171 |
+
125,deepseek-7b,52.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
172 |
+
126,phi-2,51.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
173 |
+
127,deepseekmoe-16b,51.4,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
174 |
+
128,llama-2-7b,43.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
175 |
+
129,gemma-2b,38.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
176 |
+
130,olmo-7b,31.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
177 |
+
131,mpt-7b,30.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
|
178 |
+
264,gpt-4o-2024-05-13,85.4,mmlu-mixed,mixeval_240601,[]
|
179 |
+
265,claude-3-opus,83.2,mmlu-mixed,mixeval_240601,[]
|
180 |
+
266,gpt-4-turbo-2024-04-09,82.8,mmlu-mixed,mixeval_240601,[]
|
181 |
+
267,gemini-1.5-pro-api-0409,79.2,mmlu-mixed,mixeval_240601,[]
|
182 |
+
268,yi-large-preview,80.9,mmlu-mixed,mixeval_240601,[]
|
183 |
+
269,llama-3-70b-instruct,80.5,mmlu-mixed,mixeval_240601,[]
|
184 |
+
270,qwen-max-0428,80.6,mmlu-mixed,mixeval_240601,[]
|
185 |
+
271,claude-3-sonnet,74.7,mmlu-mixed,mixeval_240601,[]
|
186 |
+
272,reka-core-20240415,79.3,mmlu-mixed,mixeval_240601,[]
|
187 |
+
273,mammoth2-8x7b-plus,74.5,mmlu-mixed,mixeval_240601,[]
|
188 |
+
274,deepseek-v2,77.3,mmlu-mixed,mixeval_240601,[]
|
189 |
+
275,command-r-plus,78.9,mmlu-mixed,mixeval_240601,[]
|
190 |
+
276,yi-1.5-34b-chat,76.4,mmlu-mixed,mixeval_240601,[]
|
191 |
+
277,mistral-large,80.2,mmlu-mixed,mixeval_240601,[]
|
192 |
+
278,qwen1.5-72b-chat,80.1,mmlu-mixed,mixeval_240601,[]
|
193 |
+
279,mistral-medium,76.3,mmlu-mixed,mixeval_240601,[]
|
194 |
+
280,gemini-1.0-pro,74.9,mmlu-mixed,mixeval_240601,[]
|
195 |
+
281,reka-flash-20240226,75.4,mmlu-mixed,mixeval_240601,[]
|
196 |
+
282,mistral-small,75.2,mmlu-mixed,mixeval_240601,[]
|
197 |
+
283,llama-3-8b-instruct,71.9,mmlu-mixed,mixeval_240601,[]
|
198 |
+
284,command-r,75.0,mmlu-mixed,mixeval_240601,[]
|
199 |
+
285,qwen1.5-32b-chat,78.0,mmlu-mixed,mixeval_240601,[]
|
200 |
+
286,gpt-3.5-turbo-0125,74.5,mmlu-mixed,mixeval_240601,[]
|
201 |
+
287,claude-3-haiku,76.1,mmlu-mixed,mixeval_240601,[]
|
202 |
+
288,yi-34b-chat,73.6,mmlu-mixed,mixeval_240601,[]
|
203 |
+
289,mixtral-8x7b-instruct-v0.1,72.0,mmlu-mixed,mixeval_240601,[]
|
204 |
+
290,starling-lm-7b-beta,69.0,mmlu-mixed,mixeval_240601,[]
|
205 |
+
291,yi-1.5-9b-chat,72.6,mmlu-mixed,mixeval_240601,[]
|
206 |
+
292,gemma-1.1-7b-it,66.9,mmlu-mixed,mixeval_240601,[]
|
207 |
+
293,vicuna-33b-v1.3,59.2,mmlu-mixed,mixeval_240601,[]
|
208 |
+
294,llama-2-70b-chat,69.8,mmlu-mixed,mixeval_240601,[]
|
209 |
+
295,map-neo-instruct-v0.1,66.7,mmlu-mixed,mixeval_240601,[]
|
210 |
+
296,mistral-7b-instruct-v0.2,67.3,mmlu-mixed,mixeval_240601,[]
|
211 |
+
297,qwen1.5-7b-chat,68.7,mmlu-mixed,mixeval_240601,[]
|
212 |
+
298,reka-edge-20240208,63.6,mmlu-mixed,mixeval_240601,[]
|
213 |
+
299,zephyr-7b-beta,64.9,mmlu-mixed,mixeval_240601,[]
|
214 |
+
300,llama-2-7b-chat,59.4,mmlu-mixed,mixeval_240601,[]
|
215 |
+
301,yi-6b-chat,65.4,mmlu-mixed,mixeval_240601,[]
|
216 |
+
302,qwen1.5-moe-a2.7b-chat,69.5,mmlu-mixed,mixeval_240601,[]
|
217 |
+
303,gemma-1.1-2b-it,51.5,mmlu-mixed,mixeval_240601,[]
|
218 |
+
304,vicuna-7b-v1.5,58.7,mmlu-mixed,mixeval_240601,[]
|
219 |
+
305,olmo-7b-instruct,57.1,mmlu-mixed,mixeval_240601,[]
|
220 |
+
306,qwen1.5-4b-chat,61.4,mmlu-mixed,mixeval_240601,[]
|
221 |
+
307,jetmoe-8b-chat,58.5,mmlu-mixed,mixeval_240601,[]
|
222 |
+
308,mpt-7b-chat,37.8,mmlu-mixed,mixeval_240601,[]
|
223 |
+
309,llama-3-70b,79.8,mmlu-mixed,mixeval_240601,[]
|
224 |
+
310,qwen1.5-72b,78.8,mmlu-mixed,mixeval_240601,[]
|
225 |
+
311,yi-34b,79.3,mmlu-mixed,mixeval_240601,[]
|
226 |
+
312,qwen1.5-32b,77.2,mmlu-mixed,mixeval_240601,[]
|
227 |
+
313,mixtral-8x7b,71.6,mmlu-mixed,mixeval_240601,[]
|
228 |
+
314,llama-2-70b,70.8,mmlu-mixed,mixeval_240601,[]
|
229 |
+
315,qwen1.5-moe-a2.7b,69.4,mmlu-mixed,mixeval_240601,[]
|
230 |
+
316,qwen1.5-7b,67.0,mmlu-mixed,mixeval_240601,[]
|
231 |
+
317,llama-3-8b,69.5,mmlu-mixed,mixeval_240601,[]
|
232 |
+
318,mistral-7b,68.5,mmlu-mixed,mixeval_240601,[]
|
233 |
+
319,gemma-7b,67.4,mmlu-mixed,mixeval_240601,[]
|
234 |
+
320,yi-6b,71.2,mmlu-mixed,mixeval_240601,[]
|
235 |
+
321,qwen1.5-4b,59.6,mmlu-mixed,mixeval_240601,[]
|
236 |
+
322,jetmoe-8b,55.3,mmlu-mixed,mixeval_240601,[]
|
237 |
+
323,deepseek-7b,53.3,mmlu-mixed,mixeval_240601,[]
|
238 |
+
324,phi-2,62.5,mmlu-mixed,mixeval_240601,[]
|
239 |
+
325,deepseekmoe-16b,49.9,mmlu-mixed,mixeval_240601,[]
|
240 |
+
326,llama-2-7b,40.8,mmlu-mixed,mixeval_240601,[]
|
241 |
+
327,gemma-2b,37.4,mmlu-mixed,mixeval_240601,[]
|
242 |
+
328,olmo-7b,29.7,mmlu-mixed,mixeval_240601,[]
|
243 |
+
329,mpt-7b,30.9,mmlu-mixed,mixeval_240601,[]
|
244 |
+
594,gpt-4o-2024-05-13,57.1,mmlu-hard-mixed,mixeval_240601,[]
|
245 |
+
595,claude-3-opus,55.0,mmlu-hard-mixed,mixeval_240601,[]
|
246 |
+
596,gpt-4-turbo-2024-04-09,45.5,mmlu-hard-mixed,mixeval_240601,[]
|
247 |
+
597,gemini-1.5-pro-api-0409,44.6,mmlu-hard-mixed,mixeval_240601,[]
|
248 |
+
598,yi-large-preview,48.5,mmlu-hard-mixed,mixeval_240601,[]
|
249 |
+
599,llama-3-70b-instruct,46.3,mmlu-hard-mixed,mixeval_240601,[]
|
250 |
+
600,qwen-max-0428,41.6,mmlu-hard-mixed,mixeval_240601,[]
|
251 |
+
601,claude-3-sonnet,40.7,mmlu-hard-mixed,mixeval_240601,[]
|
252 |
+
602,reka-core-20240415,46.3,mmlu-hard-mixed,mixeval_240601,[]
|
253 |
+
603,mammoth2-8x7b-plus,41.1,mmlu-hard-mixed,mixeval_240601,[]
|
254 |
+
604,deepseek-v2,42.0,mmlu-hard-mixed,mixeval_240601,[]
|
255 |
+
605,command-r-plus,42.0,mmlu-hard-mixed,mixeval_240601,[]
|
256 |
+
606,yi-1.5-34b-chat,38.1,mmlu-hard-mixed,mixeval_240601,[]
|
257 |
+
607,mistral-large,42.4,mmlu-hard-mixed,mixeval_240601,[]
|
258 |
+
608,qwen1.5-72b-chat,37.7,mmlu-hard-mixed,mixeval_240601,[]
|
259 |
+
609,mistral-medium,38.5,mmlu-hard-mixed,mixeval_240601,[]
|
260 |
+
610,gemini-1.0-pro,35.5,mmlu-hard-mixed,mixeval_240601,[]
|
261 |
+
611,reka-flash-20240226,34.6,mmlu-hard-mixed,mixeval_240601,[]
|
262 |
+
612,mistral-small,33.8,mmlu-hard-mixed,mixeval_240601,[]
|
263 |
+
613,llama-3-8b-instruct,40.7,mmlu-hard-mixed,mixeval_240601,[]
|
264 |
+
614,command-r,39.0,mmlu-hard-mixed,mixeval_240601,[]
|
265 |
+
615,qwen1.5-32b-chat,29.9,mmlu-hard-mixed,mixeval_240601,[]
|
266 |
+
616,gpt-3.5-turbo-0125,35.1,mmlu-hard-mixed,mixeval_240601,[]
|
267 |
+
617,claude-3-haiku,30.7,mmlu-hard-mixed,mixeval_240601,[]
|
268 |
+
618,yi-34b-chat,29.9,mmlu-hard-mixed,mixeval_240601,[]
|
269 |
+
619,mixtral-8x7b-instruct-v0.1,37.2,mmlu-hard-mixed,mixeval_240601,[]
|
270 |
+
620,starling-lm-7b-beta,34.2,mmlu-hard-mixed,mixeval_240601,[]
|
271 |
+
621,yi-1.5-9b-chat,36.8,mmlu-hard-mixed,mixeval_240601,[]
|
272 |
+
622,gemma-1.1-7b-it,39.0,mmlu-hard-mixed,mixeval_240601,[]
|
273 |
+
623,vicuna-33b-v1.3,39.4,mmlu-hard-mixed,mixeval_240601,[]
|
274 |
+
624,llama-2-70b-chat,27.7,mmlu-hard-mixed,mixeval_240601,[]
|
275 |
+
625,map-neo-instruct-v0.1,32.5,mmlu-hard-mixed,mixeval_240601,[]
|
276 |
+
626,mistral-7b-instruct-v0.2,29.4,mmlu-hard-mixed,mixeval_240601,[]
|
277 |
+
627,qwen1.5-7b-chat,29.0,mmlu-hard-mixed,mixeval_240601,[]
|
278 |
+
628,reka-edge-20240208,26.4,mmlu-hard-mixed,mixeval_240601,[]
|
279 |
+
629,zephyr-7b-beta,24.2,mmlu-hard-mixed,mixeval_240601,[]
|
280 |
+
630,llama-2-7b-chat,30.3,mmlu-hard-mixed,mixeval_240601,[]
|
281 |
+
631,yi-6b-chat,26.8,mmlu-hard-mixed,mixeval_240601,[]
|
282 |
+
632,qwen1.5-moe-a2.7b-chat,26.8,mmlu-hard-mixed,mixeval_240601,[]
|
283 |
+
633,gemma-1.1-2b-it,30.3,mmlu-hard-mixed,mixeval_240601,[]
|
284 |
+
634,vicuna-7b-v1.5,23.4,mmlu-hard-mixed,mixeval_240601,[]
|
285 |
+
635,olmo-7b-instruct,27.3,mmlu-hard-mixed,mixeval_240601,[]
|
286 |
+
636,qwen1.5-4b-chat,17.3,mmlu-hard-mixed,mixeval_240601,[]
|
287 |
+
637,jetmoe-8b-chat,25.5,mmlu-hard-mixed,mixeval_240601,[]
|
288 |
+
638,mpt-7b-chat,24.7,mmlu-hard-mixed,mixeval_240601,[]
|
289 |
+
639,llama-3-70b,39.8,mmlu-hard-mixed,mixeval_240601,[]
|
290 |
+
640,qwen1.5-72b,42.4,mmlu-hard-mixed,mixeval_240601,[]
|
291 |
+
641,yi-34b,42.4,mmlu-hard-mixed,mixeval_240601,[]
|
292 |
+
642,qwen1.5-32b,37.2,mmlu-hard-mixed,mixeval_240601,[]
|
293 |
+
643,mixtral-8x7b,34.6,mmlu-hard-mixed,mixeval_240601,[]
|
294 |
+
644,llama-2-70b,29.0,mmlu-hard-mixed,mixeval_240601,[]
|
295 |
+
645,qwen1.5-moe-a2.7b,30.7,mmlu-hard-mixed,mixeval_240601,[]
|
296 |
+
646,qwen1.5-7b,28.6,mmlu-hard-mixed,mixeval_240601,[]
|
297 |
+
647,llama-3-8b,38.5,mmlu-hard-mixed,mixeval_240601,[]
|
298 |
+
648,mistral-7b,27.7,mmlu-hard-mixed,mixeval_240601,[]
|
299 |
+
649,gemma-7b,28.1,mmlu-hard-mixed,mixeval_240601,[]
|
300 |
+
650,yi-6b,37.2,mmlu-hard-mixed,mixeval_240601,[]
|
301 |
+
651,qwen1.5-4b,22.9,mmlu-hard-mixed,mixeval_240601,[]
|
302 |
+
652,jetmoe-8b,27.3,mmlu-hard-mixed,mixeval_240601,[]
|
303 |
+
653,deepseek-7b,26.4,mmlu-hard-mixed,mixeval_240601,[]
|
304 |
+
654,phi-2,29.0,mmlu-hard-mixed,mixeval_240601,[]
|
305 |
+
655,deepseekmoe-16b,30.7,mmlu-hard-mixed,mixeval_240601,[]
|
306 |
+
656,llama-2-7b,24.7,mmlu-hard-mixed,mixeval_240601,[]
|
307 |
+
657,gemma-2b,27.3,mmlu-hard-mixed,mixeval_240601,[]
|
308 |
+
658,olmo-7b,25.1,mmlu-hard-mixed,mixeval_240601,[]
|
309 |
+
659,mpt-7b,24.2,mmlu-hard-mixed,mixeval_240601,[]
|
310 |
+
593,gpt-4-0314,0.57,agieval,BLZ_240312,[]
|
311 |
+
594,gpt-4-0613,0.57,agieval,BLZ_240312,[]
|
312 |
+
596,claude-1,0.49700000000000005,agieval,BLZ_240312,[]
|
313 |
+
601,mixtral-8x7b-instruct-v0.1,0.45299999999999996,agieval,BLZ_240312,[]
|
314 |
+
602,yi-34b-chat,0.508,agieval,BLZ_240312,[]
|
315 |
+
605,gpt-3.5-turbo-0314,0.43200000000000005,agieval,BLZ_240312,[]
|
316 |
+
608,vicuna-33b,0.373,agieval,BLZ_240312,[]
|
317 |
+
609,starling-lm-7b-alpha,0.401,agieval,BLZ_240312,[]
|
318 |
+
611,llama-2-70b-chat,0.45,agieval,BLZ_240312,[]
|
319 |
+
613,openhermes-2.5-mistral-7b,0.43,agieval,BLZ_240312,[]
|
320 |
+
614,openchat-3.5,0.42700000000000005,agieval,BLZ_240312,[]
|
321 |
+
617,solar-10.7b-instruct-v1.0,0.47600000000000003,agieval,BLZ_240312,[]
|
322 |
+
618,dolphin-2.2.1-mistral-7b,0.392,agieval,BLZ_240312,[]
|
323 |
+
620,zephyr-7b-beta,0.406,agieval,BLZ_240312,[]
|
324 |
+
623,llama-2-13b-chat,0.336,agieval,BLZ_240312,[]
|
325 |
+
624,vicuna-13b,0.368,agieval,BLZ_240312,[]
|
326 |
+
626,zephyr-7b-alpha,0.38,agieval,BLZ_240312,[]
|
327 |
+
627,qwen-14b-chat,0.396,agieval,BLZ_240312,[]
|
328 |
+
630,llama-2-7b-chat,0.29600000000000004,agieval,BLZ_240312,[]
|
329 |
+
632,mistral-7b-instruct-v0.1,0.335,agieval,BLZ_240312,[]
|
330 |
+
634,vicuna-7b,0.314,agieval,BLZ_240312,[]
|
331 |
+
636,chatglm3-6b,0.414,agieval,BLZ_240312,[]
|
332 |
+
643,chatglm-6b,0.325,agieval,BLZ_240312,[]
|
333 |
+
647,llama-13b,0.205,agieval,BLZ_240312,[]
|
334 |
+
886,gpt-4-1106-preview,0.977,alpacav1,BLZ_240312,[]
|
335 |
+
888,gpt-4-0314,0.9528,alpacav1,BLZ_240312,[]
|
336 |
+
889,gpt-4-0613,0.9528,alpacav1,BLZ_240312,[]
|
337 |
+
890,mistral-medium,0.9682999999999999,alpacav1,BLZ_240312,[]
|
338 |
+
891,claude-1,0.8839,alpacav1,BLZ_240312,[]
|
339 |
+
892,claude-2.0,0.9136,alpacav1,BLZ_240312,[]
|
340 |
+
893,gemini-pro-dev-api,0.7966,alpacav1,BLZ_240312,[]
|
341 |
+
894,claude-2.1,0.8708,alpacav1,BLZ_240312,[]
|
342 |
+
895,gpt-3.5-turbo-0613,0.8937,alpacav1,BLZ_240312,[]
|
343 |
+
896,mixtral-8x7b-instruct-v0.1,0.9478,alpacav1,BLZ_240312,[]
|
344 |
+
897,yi-34b-chat,0.9408,alpacav1,BLZ_240312,[]
|
345 |
+
898,gemini-pro,0.7966,alpacav1,BLZ_240312,[]
|
346 |
+
900,gpt-3.5-turbo-0314,0.8937,alpacav1,BLZ_240312,[]
|
347 |
+
902,tulu-2-dpo-70b,0.9503,alpacav1,BLZ_240312,[]
|
348 |
+
903,vicuna-33b,0.8898999999999999,alpacav1,BLZ_240312,[]
|
349 |
+
904,starling-lm-7b-alpha,0.9198999999999999,alpacav1,BLZ_240312,[]
|
350 |
+
906,llama-2-70b-chat,0.9266,alpacav1,BLZ_240312,[]
|
351 |
+
909,openchat-3.5,0.8851,alpacav1,BLZ_240312,[]
|
352 |
+
911,gpt-3.5-turbo-1106,0.8626,alpacav1,BLZ_240312,[]
|
353 |
+
914,wizardlm-13b-v1.2,0.8917,alpacav1,BLZ_240312,[]
|
354 |
+
915,zephyr-7b-beta,0.9059999999999999,alpacav1,BLZ_240312,[]
|
355 |
+
918,llama-2-13b-chat,0.8109000000000001,alpacav1,BLZ_240312,[]
|
356 |
+
921,zephyr-7b-alpha,0.8576,alpacav1,BLZ_240312,[]
|
357 |
+
924,guanaco-33b,0.6596,alpacav1,BLZ_240312,[]
|
358 |
+
925,llama-2-7b-chat,0.7137,alpacav1,BLZ_240312,[]
|
359 |
+
934,chatglm2-6b,0.47130000000000005,alpacav1,BLZ_240312,[]
|
360 |
+
937,openassistant-pythia-12b,0.2596,alpacav1,BLZ_240312,[]
|
361 |
+
827,gpt-4-1106-preview,0.5,alpacav2,BLZ_240312,[]
|
362 |
+
829,gpt-4-0314,0.221,alpacav2,BLZ_240312,[]
|
363 |
+
830,gpt-4-0613,0.158,alpacav2,BLZ_240312,[]
|
364 |
+
831,mistral-medium,0.21899999999999997,alpacav2,BLZ_240312,[]
|
365 |
+
832,claude-1,0.17,alpacav2,BLZ_240312,[]
|
366 |
+
833,claude-2.0,0.172,alpacav2,BLZ_240312,[]
|
367 |
+
834,gemini-pro-dev-api,0.16899999999999998,alpacav2,BLZ_240312,[]
|
368 |
+
835,claude-2.1,0.157,alpacav2,BLZ_240312,[]
|
369 |
+
836,gpt-3.5-turbo-0613,0.141,alpacav2,BLZ_240312,[]
|
370 |
+
837,mixtral-8x7b-instruct-v0.1,0.183,alpacav2,BLZ_240312,[]
|
371 |
+
838,yi-34b-chat,0.297,alpacav2,BLZ_240312,[]
|
372 |
+
839,gemini-pro,0.16899999999999998,alpacav2,BLZ_240312,[]
|
373 |
+
840,claude-instant-1,0.161,alpacav2,BLZ_240312,[]
|
374 |
+
841,gpt-3.5-turbo-0314,0.096,alpacav2,BLZ_240312,[]
|
375 |
+
842,wizardlm-70b-v1.0,0.14400000000000002,alpacav2,BLZ_240312,[]
|
376 |
+
843,tulu-2-dpo-70b,0.16,alpacav2,BLZ_240312,[]
|
377 |
+
844,vicuna-33b,0.127,alpacav2,BLZ_240312,[]
|
378 |
+
845,starling-lm-7b-alpha,0.142,alpacav2,BLZ_240312,[]
|
379 |
+
846,deepseek-llm-67b-chat,0.121,alpacav2,BLZ_240312,[]
|
380 |
+
847,llama-2-70b-chat,0.139,alpacav2,BLZ_240312,[]
|
381 |
+
849,openhermes-2.5-mistral-7b,0.10300000000000001,alpacav2,BLZ_240312,[]
|
382 |
+
852,gpt-3.5-turbo-1106,0.092,alpacav2,BLZ_240312,[]
|
383 |
+
854,dolphin-2.2.1-mistral-7b,0.09,alpacav2,BLZ_240312,[]
|
384 |
+
855,wizardlm-13b-v1.2,0.12,alpacav2,BLZ_240312,[]
|
385 |
+
856,zephyr-7b-beta,0.11,alpacav2,BLZ_240312,[]
|
386 |
+
859,llama-2-13b-chat,0.077,alpacav2,BLZ_240312,[]
|
387 |
+
860,vicuna-13b,0.067,alpacav2,BLZ_240312,[]
|
388 |
+
862,zephyr-7b-alpha,0.084,alpacav2,BLZ_240312,[]
|
389 |
+
863,qwen-14b-chat,0.075,alpacav2,BLZ_240312,[]
|
390 |
+
865,guanaco-33b,0.05,alpacav2,BLZ_240312,[]
|
391 |
+
866,llama-2-7b-chat,0.0496,alpacav2,BLZ_240312,[]
|
392 |
+
870,vicuna-7b,0.048,alpacav2,BLZ_240312,[]
|
393 |
+
875,chatglm2-6b,0.027999999999999997,alpacav2,BLZ_240312,[]
|
394 |
+
878,openassistant-pythia-12b,0.018000000000000002,alpacav2,BLZ_240312,[]
|
395 |
+
1299,gpt-4-1106-preview,0.32799999999999996,alpacaeval2-lc,BLZ_240312,[]
|
396 |
+
1301,gpt-4-0314,0.21600000000000003,alpacaeval2-lc,BLZ_240312,[]
|
397 |
+
1302,gpt-4-0613,0.18600000000000003,alpacaeval2-lc,BLZ_240312,[]
|
398 |
+
1303,mistral-medium,0.196,alpacaeval2-lc,BLZ_240312,[]
|
399 |
+
1304,claude-1,0.21100000000000002,alpacaeval2-lc,BLZ_240312,[]
|
400 |
+
1305,claude-2.0,0.21600000000000003,alpacaeval2-lc,BLZ_240312,[]
|
401 |
+
1306,gemini-pro-dev-api,0.172,alpacaeval2-lc,BLZ_240312,[]
|
402 |
+
1307,claude-2.1,0.193,alpacaeval2-lc,BLZ_240312,[]
|
403 |
+
1308,gpt-3.5-turbo-0613,0.14300000000000002,alpacaeval2-lc,BLZ_240312,[]
|
404 |
+
1309,mixtral-8x7b-instruct-v0.1,0.168,alpacaeval2-lc,BLZ_240312,[]
|
405 |
+
1310,yi-34b-chat,0.188,alpacaeval2-lc,BLZ_240312,[]
|
406 |
+
1312,claude-instant-1,0.195,alpacaeval2-lc,BLZ_240312,[]
|
407 |
+
1313,gpt-3.5-turbo-0314,0.156,alpacaeval2-lc,BLZ_240312,[]
|
408 |
+
1314,wizardlm-70b-v1.0,0.125,alpacaeval2-lc,BLZ_240312,[]
|
409 |
+
1315,tulu-2-dpo-70b,0.151,alpacaeval2-lc,BLZ_240312,[]
|
410 |
+
1316,vicuna-33b,0.115,alpacaeval2-lc,BLZ_240312,[]
|
411 |
+
1317,starling-lm-7b-alpha,0.10099999999999999,alpacaeval2-lc,BLZ_240312,[]
|
412 |
+
1318,deepseek-llm-67b-chat,0.141,alpacaeval2-lc,BLZ_240312,[]
|
413 |
+
1319,llama-2-70b-chat,0.10400000000000001,alpacaeval2-lc,BLZ_240312,[]
|
414 |
+
1321,openhermes-2.5-mistral-7b,0.126,alpacaeval2-lc,BLZ_240312,[]
|
415 |
+
1324,gpt-3.5-turbo-1106,0.155,alpacaeval2-lc,BLZ_240312,[]
|
416 |
+
1326,dolphin-2.2.1-mistral-7b,0.10800000000000001,alpacaeval2-lc,BLZ_240312,[]
|
417 |
+
1327,wizardlm-13b-v1.2,0.099,alpacaeval2-lc,BLZ_240312,[]
|
418 |
+
1328,zephyr-7b-beta,0.102,alpacaeval2-lc,BLZ_240312,[]
|
419 |
+
1331,llama-2-13b-chat,0.068,alpacaeval2-lc,BLZ_240312,[]
|
420 |
+
1332,vicuna-13b,0.085,alpacaeval2-lc,BLZ_240312,[]
|
421 |
+
1334,zephyr-7b-alpha,0.086,alpacaeval2-lc,BLZ_240312,[]
|
422 |
+
1335,qwen-14b-chat,0.1,alpacaeval2-lc,BLZ_240312,[]
|
423 |
+
1338,llama-2-7b-chat,0.045,alpacaeval2-lc,BLZ_240312,[]
|
424 |
+
1342,vicuna-7b,0.06,alpacaeval2-lc,BLZ_240312,[]
|
425 |
+
0,gpt-4-0125-preview,1.0,arena-elo,BLZ_240312,[]
|
426 |
+
1,gpt-4-1106-preview,0.9992019154030327,arena-elo,BLZ_240312,[]
|
427 |
+
2,bard-gemini-pro,0.9768555466879489,arena-elo,BLZ_240312,[]
|
428 |
+
3,gpt-4-0314,0.9497206703910615,arena-elo,BLZ_240312,[]
|
429 |
+
4,gpt-4-0613,0.9273743016759777,arena-elo,BLZ_240312,[]
|
430 |
+
5,mistral-medium,0.9177972865123704,arena-elo,BLZ_240312,[]
|
431 |
+
6,claude-1,0.9169992019154031,arena-elo,BLZ_240312,[]
|
432 |
+
7,claude-2.0,0.9034317637669593,arena-elo,BLZ_240312,[]
|
433 |
+
8,gemini-pro-dev-api,0.8938547486033519,arena-elo,BLZ_240312,[]
|
434 |
+
9,claude-2.1,0.8930566640063847,arena-elo,BLZ_240312,[]
|
435 |
+
10,gpt-3.5-turbo-0613,0.8922585794094174,arena-elo,BLZ_240312,[]
|
436 |
+
11,mixtral-8x7b-instruct-v0.1,0.8922585794094174,arena-elo,BLZ_240312,[]
|
437 |
+
12,yi-34b-chat,0.8898643256185156,arena-elo,BLZ_240312,[]
|
438 |
+
13,gemini-pro,0.8890662410215483,arena-elo,BLZ_240312,[]
|
439 |
+
14,claude-instant-1,0.8850758180367119,arena-elo,BLZ_240312,[]
|
440 |
+
15,gpt-3.5-turbo-0314,0.8818834796488427,arena-elo,BLZ_240312,[]
|
441 |
+
16,wizardlm-70b-v1.0,0.8818834796488427,arena-elo,BLZ_240312,[]
|
442 |
+
17,tulu-2-dpo-70b,0.8810853950518756,arena-elo,BLZ_240312,[]
|
443 |
+
18,vicuna-33b,0.8723064644852354,arena-elo,BLZ_240312,[]
|
444 |
+
19,starling-lm-7b-alpha,0.8699122106943336,arena-elo,BLZ_240312,[]
|
445 |
+
20,deepseek-llm-67b-chat,0.8635275339185954,arena-elo,BLZ_240312,[]
|
446 |
+
21,llama-2-70b-chat,0.8635275339185954,arena-elo,BLZ_240312,[]
|
447 |
+
22,nv-llama2-70b-steerlm-chat,0.8603351955307262,arena-elo,BLZ_240312,[]
|
448 |
+
23,openhermes-2.5-mistral-7b,0.8603351955307262,arena-elo,BLZ_240312,[]
|
449 |
+
24,openchat-3.5,0.8587390263367917,arena-elo,BLZ_240312,[]
|
450 |
+
25,pplx-70b-online,0.8587390263367917,arena-elo,BLZ_240312,[]
|
451 |
+
26,gpt-3.5-turbo-1106,0.8547486033519553,arena-elo,BLZ_240312,[]
|
452 |
+
27,solar-10.7b-instruct-v1.0,0.8499600957701516,arena-elo,BLZ_240312,[]
|
453 |
+
28,dolphin-2.2.1-mistral-7b,0.8499600957701516,arena-elo,BLZ_240312,[]
|
454 |
+
29,wizardlm-13b-v1.2,0.8443735035913806,arena-elo,BLZ_240312,[]
|
455 |
+
30,zephyr-7b-beta,0.8387869114126097,arena-elo,BLZ_240312,[]
|
456 |
+
31,mpt-30b-chat,0.8332003192338387,arena-elo,BLZ_240312,[]
|
457 |
+
32,codellama-34b-instruct,0.8324022346368715,arena-elo,BLZ_240312,[]
|
458 |
+
33,llama-2-13b-chat,0.8316041500399042,arena-elo,BLZ_240312,[]
|
459 |
+
34,vicuna-13b,0.8300079808459697,arena-elo,BLZ_240312,[]
|
460 |
+
35,pplx-7b-online,0.8284118116520351,arena-elo,BLZ_240312,[]
|
461 |
+
36,zephyr-7b-alpha,0.8276137270550679,arena-elo,BLZ_240312,[]
|
462 |
+
37,qwen-14b-chat,0.825219473264166,arena-elo,BLZ_240312,[]
|
463 |
+
38,falcon-180b-chat,0.8236233040702314,arena-elo,BLZ_240312,[]
|
464 |
+
39,guanaco-33b,0.8236233040702314,arena-elo,BLZ_240312,[]
|
465 |
+
40,llama-2-7b-chat,0.8172386272944933,arena-elo,BLZ_240312,[]
|
466 |
+
41,stripedhyena-nous-7b,0.8140462889066241,arena-elo,BLZ_240312,[]
|
467 |
+
42,mistral-7b-instruct-v0.1,0.8028731045490822,arena-elo,BLZ_240312,[]
|
468 |
+
43,palm-chat-bison-001,0.8028731045490822,arena-elo,BLZ_240312,[]
|
469 |
+
44,vicuna-7b,0.8020750199521149,arena-elo,BLZ_240312,[]
|
470 |
+
45,koala-13b,0.770949720670391,arena-elo,BLZ_240312,[]
|
471 |
+
46,chatglm3-6b,0.7661612130885874,arena-elo,BLZ_240312,[]
|
472 |
+
47,gpt4all-13b-snoozy,0.74780526735834,arena-elo,BLZ_240312,[]
|
473 |
+
48,mpt-7b-chat,0.7430167597765364,arena-elo,BLZ_240312,[]
|
474 |
+
49,chatglm2-6b,0.7422186751795691,arena-elo,BLZ_240312,[]
|
475 |
+
50,rwkv-4-raven-14b,0.7382282521947326,arena-elo,BLZ_240312,[]
|
476 |
+
51,alpaca-13b,0.7214684756584198,arena-elo,BLZ_240312,[]
|
477 |
+
52,openassistant-pythia-12b,0.7158818834796489,arena-elo,BLZ_240312,[]
|
478 |
+
53,chatglm-6b,0.704708699122107,arena-elo,BLZ_240312,[]
|
479 |
+
54,fastchat-t5-3b,0.6975259377494014,arena-elo,BLZ_240312,[]
|
480 |
+
55,stablelm-tuned-alpha-7b,0.6743814844373504,arena-elo,BLZ_240312,[]
|
481 |
+
56,dolly-v2-12b,0.6568236233040702,arena-elo,BLZ_240312,[]
|
482 |
+
57,llama-13b,0.6384676775738228,arena-elo,BLZ_240312,[]
|
483 |
+
542,mixtral-8x7b-instruct-v0.1,0.7641,gpt4all,BLZ_240312,[]
|
484 |
+
543,yi-34b-chat,0.7212999999999999,gpt4all,BLZ_240312,[]
|
485 |
+
550,starling-lm-7b-alpha,0.7272,gpt4all,BLZ_240312,[]
|
486 |
+
554,openhermes-2.5-mistral-7b,0.7312000000000001,gpt4all,BLZ_240312,[]
|
487 |
+
555,openchat-3.5,0.7292000000000001,gpt4all,BLZ_240312,[]
|
488 |
+
558,solar-10.7b-instruct-v1.0,0.7511,gpt4all,BLZ_240312,[]
|
489 |
+
559,dolphin-2.2.1-mistral-7b,0.7223999999999999,gpt4all,BLZ_240312,[]
|
490 |
+
561,zephyr-7b-beta,0.7182999999999999,gpt4all,BLZ_240312,[]
|
491 |
+
565,vicuna-13b,0.631,gpt4all,BLZ_240312,[]
|
492 |
+
567,zephyr-7b-alpha,0.7223999999999999,gpt4all,BLZ_240312,[]
|
493 |
+
573,mistral-7b-instruct-v0.1,0.6795,gpt4all,BLZ_240312,[]
|
494 |
+
575,vicuna-7b,0.61,gpt4all,BLZ_240312,[]
|
495 |
+
576,koala-13b,0.62,gpt4all,BLZ_240312,[]
|
496 |
+
578,gpt4all-13b-snoozy,0.653,gpt4all,BLZ_240312,[]
|
497 |
+
579,mpt-7b-chat,0.648,gpt4all,BLZ_240312,[]
|
498 |
+
583,openassistant-pythia-12b,0.61,gpt4all,BLZ_240312,[]
|
499 |
+
585,fastchat-t5-3b,0.537,gpt4all,BLZ_240312,[]
|
500 |
+
586,stablelm-tuned-alpha-7b,0.513,gpt4all,BLZ_240312,[]
|
501 |
+
588,llama-13b,0.63,gpt4all,BLZ_240312,[]
|
502 |
+
129,mixtral-8x7b-instruct-v0.1,0.7262000000000001,hugging-6,BLZ_240312,[]
|
503 |
+
130,yi-34b-chat,0.6531999999999999,hugging-6,BLZ_240312,[]
|
504 |
+
134,wizardlm-70b-v1.0,0.6125,hugging-6,BLZ_240312,[]
|
505 |
+
135,tulu-2-dpo-70b,0.7376999999999999,hugging-6,BLZ_240312,[]
|
506 |
+
136,vicuna-33b,0.585,hugging-6,BLZ_240312,[]
|
507 |
+
137,starling-lm-7b-alpha,0.6713,hugging-6,BLZ_240312,[]
|
508 |
+
139,llama-2-70b-chat,0.624,hugging-6,BLZ_240312,[]
|
509 |
+
141,openhermes-2.5-mistral-7b,0.6152000000000001,hugging-6,BLZ_240312,[]
|
510 |
+
142,openchat-3.5,0.6124,hugging-6,BLZ_240312,[]
|
511 |
+
145,solar-10.7b-instruct-v1.0,0.742,hugging-6,BLZ_240312,[]
|
512 |
+
146,dolphin-2.2.1-mistral-7b,0.6493000000000001,hugging-6,BLZ_240312,[]
|
513 |
+
147,wizardlm-13b-v1.2,0.5476,hugging-6,BLZ_240312,[]
|
514 |
+
148,zephyr-7b-beta,0.6195,hugging-6,BLZ_240312,[]
|
515 |
+
149,mpt-30b-chat,0.5538000000000001,hugging-6,BLZ_240312,[]
|
516 |
+
150,codellama-34b-instruct,0.5729,hugging-6,BLZ_240312,[]
|
517 |
+
151,llama-2-13b-chat,0.5490999999999999,hugging-6,BLZ_240312,[]
|
518 |
+
152,vicuna-13b,0.5539999999999999,hugging-6,BLZ_240312,[]
|
519 |
+
154,zephyr-7b-alpha,0.595,hugging-6,BLZ_240312,[]
|
520 |
+
156,falcon-180b-chat,0.6785,hugging-6,BLZ_240312,[]
|
521 |
+
158,llama-2-7b-chat,0.5074000000000001,hugging-6,BLZ_240312,[]
|
522 |
+
160,mistral-7b-instruct-v0.1,0.5496,hugging-6,BLZ_240312,[]
|
523 |
+
162,vicuna-7b,0.521,hugging-6,BLZ_240312,[]
|
524 |
+
176,yi-34bx2-moe-60b,0.7672,hugging-6,BLZ_240312,[]
|
525 |
+
947,gpt-4-0314,0.93,llmonitor,BLZ_240312,[]
|
526 |
+
948,gpt-4-0613,0.89,llmonitor,BLZ_240312,[]
|
527 |
+
950,claude-1,0.66,llmonitor,BLZ_240312,[]
|
528 |
+
951,claude-2.0,0.68,llmonitor,BLZ_240312,[]
|
529 |
+
954,gpt-3.5-turbo-0613,0.81,llmonitor,BLZ_240312,[]
|
530 |
+
958,claude-instant-1,0.6,llmonitor,BLZ_240312,[]
|
531 |
+
959,gpt-3.5-turbo-0314,0.79,llmonitor,BLZ_240312,[]
|
532 |
+
965,llama-2-70b-chat,0.6,llmonitor,BLZ_240312,[]
|
533 |
+
975,mpt-30b-chat,0.4,llmonitor,BLZ_240312,[]
|
534 |
+
976,codellama-34b-instruct,0.34,llmonitor,BLZ_240312,[]
|
535 |
+
977,llama-2-13b-chat,0.5,llmonitor,BLZ_240312,[]
|
536 |
+
978,vicuna-13b,0.5,llmonitor,BLZ_240312,[]
|
537 |
+
982,falcon-180b-chat,0.67,llmonitor,BLZ_240312,[]
|
538 |
+
983,guanaco-33b,0.43,llmonitor,BLZ_240312,[]
|
539 |
+
984,llama-2-7b-chat,0.5,llmonitor,BLZ_240312,[]
|
540 |
+
986,mistral-7b-instruct-v0.1,0.57,llmonitor,BLZ_240312,[]
|
541 |
+
987,palm-chat-bison-001,0.57,llmonitor,BLZ_240312,[]
|
542 |
+
988,vicuna-7b,0.41,llmonitor,BLZ_240312,[]
|
543 |
+
989,koala-13b,0.31,llmonitor,BLZ_240312,[]
|
544 |
+
992,mpt-7b-chat,0.43,llmonitor,BLZ_240312,[]
|
545 |
+
1000,dolly-v2-12b,0.23,llmonitor,BLZ_240312,[]
|
546 |
+
59,gpt-4-0125-preview,0.0929,mt-bench,BLZ_240312,[]
|
547 |
+
60,gpt-4-1106-preview,0.0932,mt-bench,BLZ_240312,[]
|
548 |
+
62,gpt-4-0314,0.08960000000000001,mt-bench,BLZ_240312,[]
|
549 |
+
63,gpt-4-0613,0.09179999999999999,mt-bench,BLZ_240312,[]
|
550 |
+
64,mistral-medium,0.0861,mt-bench,BLZ_240312,[]
|
551 |
+
65,claude-1,0.079,mt-bench,BLZ_240312,[]
|
552 |
+
66,claude-2.0,0.0806,mt-bench,BLZ_240312,[]
|
553 |
+
67,gemini-pro-dev-api,0.08039999999999999,mt-bench,BLZ_240312,[]
|
554 |
+
68,claude-2.1,0.0818,mt-bench,BLZ_240312,[]
|
555 |
+
69,gpt-3.5-turbo-0613,0.0839,mt-bench,BLZ_240312,[]
|
556 |
+
70,mixtral-8x7b-instruct-v0.1,0.083,mt-bench,BLZ_240312,[]
|
557 |
+
71,yi-34b-chat,0.07769999999999999,mt-bench,BLZ_240312,[]
|
558 |
+
72,gemini-pro,0.08039999999999999,mt-bench,BLZ_240312,[]
|
559 |
+
73,claude-instant-1,0.0785,mt-bench,BLZ_240312,[]
|
560 |
+
74,gpt-3.5-turbo-0314,0.0794,mt-bench,BLZ_240312,[]
|
561 |
+
75,wizardlm-70b-v1.0,0.0771,mt-bench,BLZ_240312,[]
|
562 |
+
76,tulu-2-dpo-70b,0.0789,mt-bench,BLZ_240312,[]
|
563 |
+
77,vicuna-33b,0.0712,mt-bench,BLZ_240312,[]
|
564 |
+
78,starling-lm-7b-alpha,0.0809,mt-bench,BLZ_240312,[]
|
565 |
+
79,deepseek-llm-67b-chat,0.08529999999999999,mt-bench,BLZ_240312,[]
|
566 |
+
80,llama-2-70b-chat,0.06860000000000001,mt-bench,BLZ_240312,[]
|
567 |
+
81,nv-llama2-70b-steerlm-chat,0.0754,mt-bench,BLZ_240312,[]
|
568 |
+
82,openhermes-2.5-mistral-7b,0.07690000000000001,mt-bench,BLZ_240312,[]
|
569 |
+
83,openchat-3.5,0.0781,mt-bench,BLZ_240312,[]
|
570 |
+
84,pplx-70b-online,0.0588,mt-bench,BLZ_240312,[]
|
571 |
+
85,gpt-3.5-turbo-1106,0.0832,mt-bench,BLZ_240312,[]
|
572 |
+
86,solar-10.7b-instruct-v1.0,0.0758,mt-bench,BLZ_240312,[]
|
573 |
+
88,wizardlm-13b-v1.2,0.07200000000000001,mt-bench,BLZ_240312,[]
|
574 |
+
89,zephyr-7b-beta,0.07339999999999999,mt-bench,BLZ_240312,[]
|
575 |
+
90,mpt-30b-chat,0.0639,mt-bench,BLZ_240312,[]
|
576 |
+
92,llama-2-13b-chat,0.0665,mt-bench,BLZ_240312,[]
|
577 |
+
93,vicuna-13b,0.06570000000000001,mt-bench,BLZ_240312,[]
|
578 |
+
95,zephyr-7b-alpha,0.0688,mt-bench,BLZ_240312,[]
|
579 |
+
96,qwen-14b-chat,0.0696,mt-bench,BLZ_240312,[]
|
580 |
+
98,guanaco-33b,0.0653,mt-bench,BLZ_240312,[]
|
581 |
+
99,llama-2-7b-chat,0.06269999999999999,mt-bench,BLZ_240312,[]
|
582 |
+
101,mistral-7b-instruct-v0.1,0.0684,mt-bench,BLZ_240312,[]
|
583 |
+
102,palm-chat-bison-001,0.064,mt-bench,BLZ_240312,[]
|
584 |
+
103,vicuna-7b,0.0617,mt-bench,BLZ_240312,[]
|
585 |
+
104,koala-13b,0.0535,mt-bench,BLZ_240312,[]
|
586 |
+
106,gpt4all-13b-snoozy,0.0541,mt-bench,BLZ_240312,[]
|
587 |
+
107,mpt-7b-chat,0.0542,mt-bench,BLZ_240312,[]
|
588 |
+
108,chatglm2-6b,0.0496,mt-bench,BLZ_240312,[]
|
589 |
+
109,rwkv-4-raven-14b,0.0398,mt-bench,BLZ_240312,[]
|
590 |
+
110,alpaca-13b,0.0453,mt-bench,BLZ_240312,[]
|
591 |
+
111,openassistant-pythia-12b,0.0432,mt-bench,BLZ_240312,[]
|
592 |
+
112,chatglm-6b,0.045,mt-bench,BLZ_240312,[]
|
593 |
+
113,fastchat-t5-3b,0.0304,mt-bench,BLZ_240312,[]
|
594 |
+
114,stablelm-tuned-alpha-7b,0.0275,mt-bench,BLZ_240312,[]
|
595 |
+
115,dolly-v2-12b,0.032799999999999996,mt-bench,BLZ_240312,[]
|
596 |
+
116,llama-13b,0.026099999999999998,mt-bench,BLZ_240312,[]
|
597 |
+
0,gpt-4-0613,0.957,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
598 |
+
1,llama-3-70b,0.902,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
599 |
+
2,mixtral-8x22b,0.855,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
600 |
+
3,palmyra-x-v3-72b,0.826,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
601 |
+
4,gpt-4-turbo-1106-preview,0.821,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
602 |
+
5,palm-2-unicorn,0.781,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
603 |
+
6,claude-3-opus-20240229,0.762,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
604 |
+
7,qwen1.5-72b,0.757,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
605 |
+
8,palmyra-x-v2-33b,0.736,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
606 |
+
9,yi-34b,0.723,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
607 |
+
10,qwen1.5-32b,0.689,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
608 |
+
11,claude-v1.3,0.689,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
609 |
+
12,mixtral-8x7b-32k-seqlen,0.679,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
610 |
+
13,palm-2-bison,0.655,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
611 |
+
14,claude-2.0,0.651,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
612 |
+
15,deepseek-llm-67b-chat,0.645,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
613 |
+
16,llama-2-70b,0.609,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
614 |
+
17,claude-2.1,0.594,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
615 |
+
18,gpt-3.5-text-davinci-003,0.577,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
616 |
+
19,qwen1.5-14b,0.574,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
617 |
+
20,claude-instant-1.2,0.551,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
618 |
+
21,llama-3-8b,0.519,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
619 |
+
22,gpt-3.5-turbo-0613,0.502,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
620 |
+
23,gemma-7b,0.47,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
621 |
+
24,claude-3-sonnet-20240229,0.468,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
622 |
+
25,gpt-3.5-text-davinci-002,0.468,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
623 |
+
26,llama-65b,0.466,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
624 |
+
27,mistral-large-2402,0.46,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
625 |
+
28,cohere-command,0.421,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
626 |
+
29,dbrx-instructruct,0.419,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
627 |
+
30,mistral-v0.1-7b,0.415,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
628 |
+
31,mistral-small-2402,0.415,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
629 |
+
32,mistral-medium-2312,0.383,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
630 |
+
33,qwen1.5-7b,0.377,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
631 |
+
34,claude-3-haiku-20240307,0.377,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
632 |
+
35,yi-6b,0.351,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
633 |
+
36,llama-2-13b,0.332,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
634 |
+
37,jurassic-2-jumbo-178b,0.317,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
635 |
+
38,falcon-40b,0.306,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
636 |
+
39,phi-2,0.26,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
637 |
+
40,jurassic-2-grande-17b,0.253,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
638 |
+
41,llama-2-7b,0.234,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
639 |
+
42,luminous-supreme-70b,0.213,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
640 |
+
43,cohere-command-light,0.166,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
641 |
+
44,luminous-extended-30b,0.119,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
642 |
+
45,falcon-7b,0.1,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
643 |
+
46,olmo-7b,0.083,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
644 |
+
47,luminous-base-13b,0.072,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
|
645 |
+
0,llama-2-70b,0.944,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
646 |
+
1,llama-65b,0.908,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
647 |
+
2,text-davinci-002,0.905,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
648 |
+
3,mistral-v0.1-7b,0.884,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
649 |
+
4,cohere-command-beta-52.4b,0.874,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
650 |
+
5,text-davinci-003,0.872,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
651 |
+
6,jurassic-2-jumbo-178b,0.824,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
652 |
+
7,llama-2-13b,0.823,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
653 |
+
8,tnlg-v2-530b,0.787,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
654 |
+
9,gpt-3.5-turbo-0613,0.783,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
655 |
+
10,llama-30b,0.781,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
656 |
+
11,anthropic-lm-v4-s3-52b,0.78,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
657 |
+
12,gpt-3.5-turbo-0301,0.76,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
658 |
+
13,jurassic-2-grande-17b,0.743,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
659 |
+
14,palmyra-x-43b,0.732,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
660 |
+
15,falcon-40b,0.729,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
661 |
+
16,falcon-instruct-40b,0.727,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
662 |
+
17,mpt-instruct-30b,0.716,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
663 |
+
18,mpt-30b,0.714,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
664 |
+
19,j1-grande-v2-beta-17b,0.706,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
665 |
+
20,vicuna-v1.3-13b,0.706,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
666 |
+
21,cohere-command-beta-6.1b,0.675,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
667 |
+
22,cohere-xlarge-v20221108-52.4b,0.664,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
668 |
+
23,luminous-supreme-70b,0.662,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
669 |
+
24,vicuna-v1.3-7b,0.625,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
670 |
+
25,opt-175b,0.609,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
671 |
+
26,llama-2-7b,0.607,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
672 |
+
27,llama-13b,0.595,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
673 |
+
28,instructpalmyra-30b,0.568,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
674 |
+
29,cohere-xlarge-v20220609-52.4b,0.56,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
675 |
+
30,jurassic-2-large-7.5b,0.553,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
676 |
+
31,davinci-175b,0.538,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
677 |
+
32,llama-7b,0.533,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
678 |
+
33,redpajama-incite-instruct-7b,0.524,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
679 |
+
34,j1-jumbo-v1-178b,0.517,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
680 |
+
35,glm-130b,0.512,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
681 |
+
36,luminous-extended-30b,0.485,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
682 |
+
37,opt-66b,0.448,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
683 |
+
38,bloom-176b,0.446,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
684 |
+
39,j1-grande-v1-17b,0.433,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
685 |
+
40,alpaca-7b,0.381,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
686 |
+
41,falcon-7b,0.378,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
687 |
+
42,redpajama-incite-base-7b,0.378,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
688 |
+
43,cohere-large-v20220720-13.1b,0.372,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
689 |
+
44,redpajama-incite-instruct-v1-3b,0.366,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
690 |
+
45,text-curie-001,0.36,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
691 |
+
46,gpt-neox-20b,0.351,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
692 |
+
47,luminous-base-13b,0.315,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
693 |
+
48,cohere-medium-v20221108-6.1b,0.312,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
694 |
+
49,redpajama-incite-base-v1-3b,0.311,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
695 |
+
50,tnlg-v2-6.7b,0.309,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
696 |
+
51,j1-large-v1-7.5b,0.285,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
697 |
+
52,gpt-j-6b,0.273,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
698 |
+
53,pythia-12b,0.257,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
699 |
+
54,curie-6.7b,0.247,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
700 |
+
55,falcon-instruct-7b,0.244,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
701 |
+
56,cohere-medium-v20220720-6.1b,0.23,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
702 |
+
57,text-babbage-001,0.229,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
703 |
+
58,t0pp-11b,0.197,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
704 |
+
59,pythia-6.9b,0.196,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
705 |
+
60,ul2-20b,0.167,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
706 |
+
61,t5-11b,0.131,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
707 |
+
62,babbage-1.3b,0.114,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
708 |
+
63,cohere-small-v20220720-410m,0.109,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
709 |
+
64,ada-350m,0.108,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
710 |
+
65,text-ada-001,0.107,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
711 |
+
66,yalm-100b,0.075,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
|
712 |
+
67,llama-2-70b,0.582,mmlu,helm_classic_240130,[]
|
713 |
+
68,llama-65b,0.584,mmlu,helm_classic_240130,[]
|
714 |
+
69,text-davinci-002,0.568,mmlu,helm_classic_240130,[]
|
715 |
+
70,mistral-v0.1-7b,0.572,mmlu,helm_classic_240130,[]
|
716 |
+
71,cohere-command-beta-52.4b,0.452,mmlu,helm_classic_240130,[]
|
717 |
+
72,text-davinci-003,0.569,mmlu,helm_classic_240130,[]
|
718 |
+
73,jurassic-2-jumbo-178b,0.48,mmlu,helm_classic_240130,[]
|
719 |
+
74,llama-2-13b,0.507,mmlu,helm_classic_240130,[]
|
720 |
+
75,tnlg-v2-530b,0.469,mmlu,helm_classic_240130,[]
|
721 |
+
76,gpt-3.5-turbo-0613,0.391,mmlu,helm_classic_240130,[]
|
722 |
+
77,llama-30b,0.531,mmlu,helm_classic_240130,[]
|
723 |
+
78,anthropic-lm-v4-s3-52b,0.481,mmlu,helm_classic_240130,[]
|
724 |
+
79,gpt-3.5-turbo-0301,0.59,mmlu,helm_classic_240130,[]
|
725 |
+
80,jurassic-2-grande-17b,0.475,mmlu,helm_classic_240130,[]
|
726 |
+
81,palmyra-x-43b,0.609,mmlu,helm_classic_240130,[]
|
727 |
+
82,falcon-40b,0.509,mmlu,helm_classic_240130,[]
|
728 |
+
83,falcon-instruct-40b,0.497,mmlu,helm_classic_240130,[]
|
729 |
+
84,mpt-instruct-30b,0.444,mmlu,helm_classic_240130,[]
|
730 |
+
85,mpt-30b,0.437,mmlu,helm_classic_240130,[]
|
731 |
+
86,j1-grande-v2-beta-17b,0.445,mmlu,helm_classic_240130,[]
|
732 |
+
87,vicuna-v1.3-13b,0.462,mmlu,helm_classic_240130,[]
|
733 |
+
88,cohere-command-beta-6.1b,0.406,mmlu,helm_classic_240130,[]
|
734 |
+
89,cohere-xlarge-v20221108-52.4b,0.382,mmlu,helm_classic_240130,[]
|
735 |
+
90,luminous-supreme-70b,0.38,mmlu,helm_classic_240130,[]
|
736 |
+
91,vicuna-v1.3-7b,0.434,mmlu,helm_classic_240130,[]
|
737 |
+
92,opt-175b,0.318,mmlu,helm_classic_240130,[]
|
738 |
+
93,llama-2-7b,0.431,mmlu,helm_classic_240130,[]
|
739 |
+
94,llama-13b,0.422,mmlu,helm_classic_240130,[]
|
740 |
+
95,instructpalmyra-30b,0.403,mmlu,helm_classic_240130,[]
|
741 |
+
96,cohere-xlarge-v20220609-52.4b,0.353,mmlu,helm_classic_240130,[]
|
742 |
+
97,jurassic-2-large-7.5b,0.339,mmlu,helm_classic_240130,[]
|
743 |
+
98,davinci-175b,0.422,mmlu,helm_classic_240130,[]
|
744 |
+
99,llama-7b,0.321,mmlu,helm_classic_240130,[]
|
745 |
+
100,redpajama-incite-instruct-7b,0.363,mmlu,helm_classic_240130,[]
|
746 |
+
101,j1-jumbo-v1-178b,0.259,mmlu,helm_classic_240130,[]
|
747 |
+
102,glm-130b,0.344,mmlu,helm_classic_240130,[]
|
748 |
+
103,luminous-extended-30b,0.321,mmlu,helm_classic_240130,[]
|
749 |
+
104,opt-66b,0.276,mmlu,helm_classic_240130,[]
|
750 |
+
105,bloom-176b,0.299,mmlu,helm_classic_240130,[]
|
751 |
+
106,j1-grande-v1-17b,0.27,mmlu,helm_classic_240130,[]
|
752 |
+
107,alpaca-7b,0.385,mmlu,helm_classic_240130,[]
|
753 |
+
108,falcon-7b,0.286,mmlu,helm_classic_240130,[]
|
754 |
+
109,redpajama-incite-base-7b,0.302,mmlu,helm_classic_240130,[]
|
755 |
+
110,cohere-large-v20220720-13.1b,0.324,mmlu,helm_classic_240130,[]
|
756 |
+
111,redpajama-incite-instruct-v1-3b,0.257,mmlu,helm_classic_240130,[]
|
757 |
+
112,text-curie-001,0.237,mmlu,helm_classic_240130,[]
|
758 |
+
113,gpt-neox-20b,0.276,mmlu,helm_classic_240130,[]
|
759 |
+
114,luminous-base-13b,0.27,mmlu,helm_classic_240130,[]
|
760 |
+
115,cohere-medium-v20221108-6.1b,0.254,mmlu,helm_classic_240130,[]
|
761 |
+
116,redpajama-incite-base-v1-3b,0.263,mmlu,helm_classic_240130,[]
|
762 |
+
117,tnlg-v2-6.7b,0.242,mmlu,helm_classic_240130,[]
|
763 |
+
118,j1-large-v1-7.5b,0.241,mmlu,helm_classic_240130,[]
|
764 |
+
119,gpt-j-6b,0.249,mmlu,helm_classic_240130,[]
|
765 |
+
120,pythia-12b,0.274,mmlu,helm_classic_240130,[]
|
766 |
+
121,curie-6.7b,0.243,mmlu,helm_classic_240130,[]
|
767 |
+
122,falcon-instruct-7b,0.275,mmlu,helm_classic_240130,[]
|
768 |
+
123,cohere-medium-v20220720-6.1b,0.279,mmlu,helm_classic_240130,[]
|
769 |
+
124,text-babbage-001,0.229,mmlu,helm_classic_240130,[]
|
770 |
+
125,t0pp-11b,0.407,mmlu,helm_classic_240130,[]
|
771 |
+
126,pythia-6.9b,0.236,mmlu,helm_classic_240130,[]
|
772 |
+
127,ul2-20b,0.291,mmlu,helm_classic_240130,[]
|
773 |
+
128,t5-11b,0.29,mmlu,helm_classic_240130,[]
|
774 |
+
129,babbage-1.3b,0.235,mmlu,helm_classic_240130,[]
|
775 |
+
130,cohere-small-v20220720-410m,0.264,mmlu,helm_classic_240130,[]
|
776 |
+
131,ada-350m,0.243,mmlu,helm_classic_240130,[]
|
777 |
+
132,text-ada-001,0.238,mmlu,helm_classic_240130,[]
|
778 |
+
133,yalm-100b,0.243,mmlu,helm_classic_240130,[]
|
779 |
+
0,gpt-4o-0513,35.7,wildbench-mix,wildbench_240612,[]
|
780 |
+
1,gpt-4-turbo-0409,34.6,wildbench-mix,wildbench_240612,[]
|
781 |
+
2,gpt-4-turbo-0125,29.9,wildbench-mix,wildbench_240612,[]
|
782 |
+
3,gemini-1.5-pro,27.8,wildbench-mix,wildbench_240612,[]
|
783 |
+
4,llama-3-70b-inst,21.0,wildbench-mix,wildbench_240612,[]
|
784 |
+
5,claude-3-opus,20.1,wildbench-mix,wildbench_240612,[]
|
785 |
+
6,gemini-1.5-flash,17.4,wildbench-mix,wildbench_240612,[]
|
786 |
+
7,yi-1.5-34b-chat,16.8,wildbench-mix,wildbench_240612,[]
|
787 |
+
8,llama3-inst-8b-simpo,14.0,wildbench-mix,wildbench_240612,[]
|
788 |
+
9,claude-3-sonnet,7.2,wildbench-mix,wildbench_240612,[]
|
789 |
+
10,qwen1.5-72b-chat,4.4,wildbench-mix,wildbench_240612,[]
|
790 |
+
11,command-r-plus,0.4,wildbench-mix,wildbench_240612,[]
|
791 |
+
12,claude-3-haiku,-8.5,wildbench-mix,wildbench_240612,[]
|
792 |
+
13,mistral-large,-10.5,wildbench-mix,wildbench_240612,[]
|
793 |
+
14,starlinglm-7b-beta,-11.9,wildbench-mix,wildbench_240612,[]
|
794 |
+
15,llama-3-8b-inst,-14.6,wildbench-mix,wildbench_240612,[]
|
795 |
+
16,command-r,-16.0,wildbench-mix,wildbench_240612,[]
|
796 |
+
17,mixtral-8x7b-inst,-18.8,wildbench-mix,wildbench_240612,[]
|
797 |
+
18,dbrx-instruct,-21.6,wildbench-mix,wildbench_240612,[]
|
798 |
+
19,yi-1.5-6b-chat,-24.3,wildbench-mix,wildbench_240612,[]
|
799 |
+
20,mistral-7b-inst-v0.2,-25.0,wildbench-mix,wildbench_240612,[]
|
800 |
+
21,tulu-2-dpo-70b,-25.4,wildbench-mix,wildbench_240612,[]
|
801 |
+
22,llama-2-70b-chat,-26.8,wildbench-mix,wildbench_240612,[]
|
802 |
+
23,qwen1.5-7b-chat,-27.0,wildbench-mix,wildbench_240612,[]
|
803 |
+
24,phi-3-medium-128k,-33.3,wildbench-mix,wildbench_240612,[]
|
804 |
+
25,gpt-3.5-turbo-0125,-33.5,wildbench-mix,wildbench_240612,[]
|
805 |
+
26,llama-2-7b-chat,-48.0,wildbench-mix,wildbench_240612,[]
|
806 |
+
27,gemma-7b-it,-57.0,wildbench-mix,wildbench_240612,[]
|
807 |
+
28,gemma-2b-it,-74.1,wildbench-mix,wildbench_240612,[]
|
808 |
+
13,flan-t5-xxl,0.2244897959183673,mmlu_pro,bluebench_v02,[]
|
809 |
+
30,granite-13b-chat-v2,0.2857142857142857,mmlu_pro,bluebench_v02,[]
|
810 |
+
41,granite-13b-instruct-v2,0.0408163265306122,mmlu_pro,bluebench_v02,[]
|
811 |
+
50,granite-7b-lab,0.2423469387755102,mmlu_pro,bluebench_v02,[]
|
812 |
+
60,llama-2-13b-chat,0.0943877551020408,mmlu_pro,bluebench_v02,[]
|
813 |
+
70,llama-2-70b,0.4081632653061224,mmlu_pro,bluebench_v02,[]
|
814 |
+
81,llama-3-70b-instruct,0.4285714285714285,mmlu_pro,bluebench_v02,[]
|
815 |
+
92,llama-3-8b,0.375,mmlu_pro,bluebench_v02,[]
|
816 |
+
103,llama-3-8b-instruct,0.0994897959183673,mmlu_pro,bluebench_v02,[]
|
817 |
+
112,llama-30b,0.3061224489795918,mmlu_pro,bluebench_v02,[]
|
818 |
+
121,llama-7b,0.1326530612244897,mmlu_pro,bluebench_v02,[]
|
819 |
+
132,mistral-v0.1-7b,0.2857142857142857,mmlu_pro,bluebench_v02,[]
|
820 |
+
143,mixtral-8x7b-instruct-v01,0.375,mmlu_pro,bluebench_v02,[]
|
821 |
+
153,vicuna-13b-v1.5-16k,0.2857142857142857,mmlu_pro,bluebench_v02,[]
|
822 |
+
162,vicuna-33b-v1.3,0.2653061224489796,mmlu_pro,bluebench_v02,[]
|
823 |
+
172,vicuna-v1.3-7b,0.1938775510204081,mmlu_pro,bluebench_v02,[]
|
824 |
+
182,vicuna-7b-v1.5,0.2857142857142857,mmlu_pro,bluebench_v02,[]
|
825 |
+
192,zephyr-7b-beta,0.2959183673469387,mmlu_pro,bluebench_v02,[]
|
assets/combined_holistic_20240708.csv
ADDED
@@ -0,0 +1,938 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
model,score,scenario,source,aggragated_from
|
2 |
+
gpt_4_turbo_2024_04_09,82.6,arena_hard,arena_hard_2404,[]
|
3 |
+
gpt_4_0125_preview,78.0,arena_hard,arena_hard_2404,[]
|
4 |
+
gemini_1.5_pro_api_preview,72.0,arena_hard,arena_hard_2404,[]
|
5 |
+
yi_large,63.7,arena_hard,arena_hard_2404,[]
|
6 |
+
claude_3_opus_20240229,60.4,arena_hard,arena_hard_2404,[]
|
7 |
+
glm_4,55.7,arena_hard,arena_hard_2404,[]
|
8 |
+
gpt_4_0314,50.0,arena_hard,arena_hard_2404,[]
|
9 |
+
gemini_1.5_flash_api_preview,49.6,arena_hard,arena_hard_2404,[]
|
10 |
+
claude_3_sonnet_20240229,46.8,arena_hard,arena_hard_2404,[]
|
11 |
+
claude_3_haiku_20240307,41.5,arena_hard,arena_hard_2404,[]
|
12 |
+
llama_3_70b_chat,41.1,arena_hard,arena_hard_2404,[]
|
13 |
+
gpt_4_0613,37.9,arena_hard,arena_hard_2404,[]
|
14 |
+
mistral_large_2402,37.7,arena_hard,arena_hard_2404,[]
|
15 |
+
mixtral_8x22b_instruct_v0.1,36.4,arena_hard,arena_hard_2404,[]
|
16 |
+
qwen1.5_72b_chat,36.1,arena_hard,arena_hard_2404,[]
|
17 |
+
command_r_plus,33.1,arena_hard,arena_hard_2404,[]
|
18 |
+
mistral_medium,31.9,arena_hard,arena_hard_2404,[]
|
19 |
+
mistral_next,27.4,arena_hard,arena_hard_2404,[]
|
20 |
+
gpt_3.5_turbo_0613,24.8,arena_hard,arena_hard_2404,[]
|
21 |
+
claude_2.0,24.0,arena_hard,arena_hard_2404,[]
|
22 |
+
dbrx_instructruct,23.9,arena_hard,arena_hard_2404,[]
|
23 |
+
mixtral_8x7b_instruct_v0.1,23.4,arena_hard,arena_hard_2404,[]
|
24 |
+
gpt_3.5_turbo_0125,23.3,arena_hard,arena_hard_2404,[]
|
25 |
+
yi_34b_chat,23.1,arena_hard,arena_hard_2404,[]
|
26 |
+
starling_lm_7b_beta,23.0,arena_hard,arena_hard_2404,[]
|
27 |
+
claude_2.1,22.8,arena_hard,arena_hard_2404,[]
|
28 |
+
snorkel_mistral_pairrm_dpo,20.7,arena_hard,arena_hard_2404,[]
|
29 |
+
llama_3_8b_chat,20.6,arena_hard,arena_hard_2404,[]
|
30 |
+
gpt_3.5_turbo_1106,18.9,arena_hard,arena_hard_2404,[]
|
31 |
+
gpt_3.5_turbo_0301,18.1,arena_hard,arena_hard_2404,[]
|
32 |
+
gemini_1.0_pro,17.8,arena_hard,arena_hard_2404,[]
|
33 |
+
snowflake_arctic_instruct,17.6,arena_hard,arena_hard_2404,[]
|
34 |
+
command_r,17.0,arena_hard,arena_hard_2404,[]
|
35 |
+
phi_3_mini_128k_instruct,15.4,arena_hard,arena_hard_2404,[]
|
36 |
+
tulu_2_dpo_70b,15.0,arena_hard,arena_hard_2404,[]
|
37 |
+
starling_lm_7b_alpha,12.8,arena_hard,arena_hard_2404,[]
|
38 |
+
mistral_7b_instruct,12.6,arena_hard,arena_hard_2404,[]
|
39 |
+
gemma_1.1_7b_it,12.1,arena_hard,arena_hard_2404,[]
|
40 |
+
llama_2_70b_chat,11.6,arena_hard,arena_hard_2404,[]
|
41 |
+
vicuna_33b_v1.3,8.6,arena_hard,arena_hard_2404,[]
|
42 |
+
gemma_7b_it,7.5,arena_hard,arena_hard_2404,[]
|
43 |
+
llama_2_7b_chat,4.6,arena_hard,arena_hard_2404,[]
|
44 |
+
gemma_1.1_2b_it,3.4,arena_hard,arena_hard_2404,[]
|
45 |
+
gemma_2b_it,3.0,arena_hard,arena_hard_2404,[]
|
46 |
+
gpt_4o_2024_05_13,87.9,mixeval,mixeval_240601,[]
|
47 |
+
claude_3_opus,88.1,mixeval,mixeval_240601,[]
|
48 |
+
gpt_4_turbo_2024_04_09,88.8,mixeval,mixeval_240601,[]
|
49 |
+
gemini_1.5_pro_api_0409,84.2,mixeval,mixeval_240601,[]
|
50 |
+
yi_large_preview,84.4,mixeval,mixeval_240601,[]
|
51 |
+
llama_3_70b_instruct,84.0,mixeval,mixeval_240601,[]
|
52 |
+
qwen_max_0428,86.1,mixeval,mixeval_240601,[]
|
53 |
+
claude_3_sonnet,81.7,mixeval,mixeval_240601,[]
|
54 |
+
reka_core_20240415,83.3,mixeval,mixeval_240601,[]
|
55 |
+
mammoth2_8x7b_plus,81.5,mixeval,mixeval_240601,[]
|
56 |
+
deepseek_v2,83.7,mixeval,mixeval_240601,[]
|
57 |
+
command_r_plus,81.5,mixeval,mixeval_240601,[]
|
58 |
+
yi_1.5_34b_chat,81.7,mixeval,mixeval_240601,[]
|
59 |
+
mistral_large,84.2,mixeval,mixeval_240601,[]
|
60 |
+
qwen1.5_72b_chat,84.1,mixeval,mixeval_240601,[]
|
61 |
+
mistral_medium,81.9,mixeval,mixeval_240601,[]
|
62 |
+
gemini_1.0_pro,78.9,mixeval,mixeval_240601,[]
|
63 |
+
reka_flash_20240226,79.8,mixeval,mixeval_240601,[]
|
64 |
+
mistral_small,81.2,mixeval,mixeval_240601,[]
|
65 |
+
llama_3_8b_instruct,75.0,mixeval,mixeval_240601,[]
|
66 |
+
command_r,77.0,mixeval,mixeval_240601,[]
|
67 |
+
qwen1.5_32b_chat,81.0,mixeval,mixeval_240601,[]
|
68 |
+
gpt_3.5_turbo_0125,79.7,mixeval,mixeval_240601,[]
|
69 |
+
claude_3_haiku,79.7,mixeval,mixeval_240601,[]
|
70 |
+
yi_34b_chat,80.1,mixeval,mixeval_240601,[]
|
71 |
+
mixtral_8x7b_instruct_v0.1,76.4,mixeval,mixeval_240601,[]
|
72 |
+
starling_lm_7b_beta,74.8,mixeval,mixeval_240601,[]
|
73 |
+
yi_1.5_9b_chat,74.2,mixeval,mixeval_240601,[]
|
74 |
+
gemma_1.1_7b_it,69.6,mixeval,mixeval_240601,[]
|
75 |
+
vicuna_33b_v1.3,66.3,mixeval,mixeval_240601,[]
|
76 |
+
llama_2_70b_chat,74.6,mixeval,mixeval_240601,[]
|
77 |
+
map_neo_instruct_v0.1,70.0,mixeval,mixeval_240601,[]
|
78 |
+
mistral_7b_instruct_v0.2,70.0,mixeval,mixeval_240601,[]
|
79 |
+
qwen1.5_7b_chat,71.4,mixeval,mixeval_240601,[]
|
80 |
+
reka_edge_20240208,68.5,mixeval,mixeval_240601,[]
|
81 |
+
zephyr_7b_beta,69.1,mixeval,mixeval_240601,[]
|
82 |
+
llama_2_7b_chat,61.7,mixeval,mixeval_240601,[]
|
83 |
+
yi_6b_chat,65.6,mixeval,mixeval_240601,[]
|
84 |
+
qwen1.5_moe_a2.7b_chat,69.1,mixeval,mixeval_240601,[]
|
85 |
+
gemma_1.1_2b_it,51.9,mixeval,mixeval_240601,[]
|
86 |
+
vicuna_7b_v1.5,60.3,mixeval,mixeval_240601,[]
|
87 |
+
olmo_7b_instruct,55.0,mixeval,mixeval_240601,[]
|
88 |
+
qwen1.5_4b_chat,57.2,mixeval,mixeval_240601,[]
|
89 |
+
jetmoe_8b_chat,51.6,mixeval,mixeval_240601,[]
|
90 |
+
mpt_7b_chat,43.8,mixeval,mixeval_240601,[]
|
91 |
+
llama_3_70b,82.2,mixeval,mixeval_240601,[]
|
92 |
+
qwen1.5_72b,79.5,mixeval,mixeval_240601,[]
|
93 |
+
yi_34b,78.3,mixeval,mixeval_240601,[]
|
94 |
+
qwen1.5_32b,77.6,mixeval,mixeval_240601,[]
|
95 |
+
mixtral_8x7b,74.0,mixeval,mixeval_240601,[]
|
96 |
+
llama_2_70b,73.2,mixeval,mixeval_240601,[]
|
97 |
+
qwen1.5_moe_a2.7b,70.2,mixeval,mixeval_240601,[]
|
98 |
+
qwen1.5_7b,68.2,mixeval,mixeval_240601,[]
|
99 |
+
llama_3_8b,65.1,mixeval,mixeval_240601,[]
|
100 |
+
mistral_7b,64.8,mixeval,mixeval_240601,[]
|
101 |
+
gemma_7b,64.7,mixeval,mixeval_240601,[]
|
102 |
+
yi_6b,63.1,mixeval,mixeval_240601,[]
|
103 |
+
qwen1.5_4b,58.2,mixeval,mixeval_240601,[]
|
104 |
+
jetmoe_8b,57.1,mixeval,mixeval_240601,[]
|
105 |
+
deepseek_7b,52.2,mixeval,mixeval_240601,[]
|
106 |
+
phi_2,51.9,mixeval,mixeval_240601,[]
|
107 |
+
deepseekmoe_16b,51.4,mixeval,mixeval_240601,[]
|
108 |
+
llama_2_7b,43.1,mixeval,mixeval_240601,[]
|
109 |
+
gemma_2b,38.9,mixeval,mixeval_240601,[]
|
110 |
+
olmo_7b,31.8,mixeval,mixeval_240601,[]
|
111 |
+
mpt_7b,30.8,mixeval,mixeval_240601,[]
|
112 |
+
gpt_4_0314,0.57,agieval,BLZ_240312,[]
|
113 |
+
gpt_4_0613,0.57,agieval,BLZ_240312,[]
|
114 |
+
claude_1,0.49700000000000005,agieval,BLZ_240312,[]
|
115 |
+
mixtral_8x7b_instruct_v0.1,0.45299999999999996,agieval,BLZ_240312,[]
|
116 |
+
yi_34b_chat,0.508,agieval,BLZ_240312,[]
|
117 |
+
gpt_3.5_turbo_0314,0.43200000000000005,agieval,BLZ_240312,[]
|
118 |
+
vicuna_33b,0.373,agieval,BLZ_240312,[]
|
119 |
+
starling_lm_7b_alpha,0.401,agieval,BLZ_240312,[]
|
120 |
+
llama_2_70b_chat,0.45,agieval,BLZ_240312,[]
|
121 |
+
openhermes_2.5_mistral_7b,0.43,agieval,BLZ_240312,[]
|
122 |
+
openchat_3.5,0.42700000000000005,agieval,BLZ_240312,[]
|
123 |
+
solar_10.7b_instruct_v1.0,0.47600000000000003,agieval,BLZ_240312,[]
|
124 |
+
dolphin_2.2.1_mistral_7b,0.392,agieval,BLZ_240312,[]
|
125 |
+
zephyr_7b_beta,0.406,agieval,BLZ_240312,[]
|
126 |
+
llama_2_13b_chat,0.336,agieval,BLZ_240312,[]
|
127 |
+
vicuna_13b,0.368,agieval,BLZ_240312,[]
|
128 |
+
zephyr_7b_alpha,0.38,agieval,BLZ_240312,[]
|
129 |
+
qwen_14b_chat,0.396,agieval,BLZ_240312,[]
|
130 |
+
llama_2_7b_chat,0.29600000000000004,agieval,BLZ_240312,[]
|
131 |
+
mistral_7b_instruct_v0.1,0.335,agieval,BLZ_240312,[]
|
132 |
+
vicuna_7b,0.314,agieval,BLZ_240312,[]
|
133 |
+
chatglm3_6b,0.414,agieval,BLZ_240312,[]
|
134 |
+
chatglm_6b,0.325,agieval,BLZ_240312,[]
|
135 |
+
llama_13b,0.205,agieval,BLZ_240312,[]
|
136 |
+
gpt_4_0314,0.963,arc_c,BLZ_240312,[]
|
137 |
+
mistral_medium,0.899,arc_c,BLZ_240312,[]
|
138 |
+
mixtral_8x7b_instruct_v0.1,0.7021999999999999,arc_c,BLZ_240312,[]
|
139 |
+
yi_34b_chat,0.6544,arc_c,BLZ_240312,[]
|
140 |
+
gpt_3.5_turbo_0314,0.855,arc_c,BLZ_240312,[]
|
141 |
+
wizardlm_70b_v1.0,0.6544,arc_c,BLZ_240312,[]
|
142 |
+
tulu_2_dpo_70b,0.721,arc_c,BLZ_240312,[]
|
143 |
+
vicuna_33b,0.6212,arc_c,BLZ_240312,[]
|
144 |
+
starling_lm_7b_alpha,0.6382,arc_c,BLZ_240312,[]
|
145 |
+
llama_2_70b_chat,0.6459,arc_c,BLZ_240312,[]
|
146 |
+
openhermes_2.5_mistral_7b,0.6493000000000001,arc_c,BLZ_240312,[]
|
147 |
+
openchat_3.5,0.6391,arc_c,BLZ_240312,[]
|
148 |
+
solar_10.7b_instruct_v1.0,0.7108,arc_c,BLZ_240312,[]
|
149 |
+
dolphin_2.2.1_mistral_7b,0.6331,arc_c,BLZ_240312,[]
|
150 |
+
wizardlm_13b_v1.2,0.5904,arc_c,BLZ_240312,[]
|
151 |
+
zephyr_7b_beta,0.6203,arc_c,BLZ_240312,[]
|
152 |
+
mpt_30b_chat,0.5870000000000001,arc_c,BLZ_240312,[]
|
153 |
+
codellama_34b_instruct,0.5427000000000001,arc_c,BLZ_240312,[]
|
154 |
+
llama_2_13b_chat,0.5904,arc_c,BLZ_240312,[]
|
155 |
+
vicuna_13b,0.5708,arc_c,BLZ_240312,[]
|
156 |
+
zephyr_7b_alpha,0.6101,arc_c,BLZ_240312,[]
|
157 |
+
falcon_180b_chat,0.6945,arc_c,BLZ_240312,[]
|
158 |
+
llama_2_7b_chat,0.529,arc_c,BLZ_240312,[]
|
159 |
+
mistral_7b_instruct_v0.1,0.5452,arc_c,BLZ_240312,[]
|
160 |
+
vicuna_7b,0.5324,arc_c,BLZ_240312,[]
|
161 |
+
yi_34bx2_moe_60b,0.7108,arc_c,BLZ_240312,[]
|
162 |
+
gpt_4_1106_preview,0.977,alpacav1,BLZ_240312,[]
|
163 |
+
gpt_4_0314,0.9528,alpacav1,BLZ_240312,[]
|
164 |
+
gpt_4_0613,0.9528,alpacav1,BLZ_240312,[]
|
165 |
+
mistral_medium,0.9682999999999999,alpacav1,BLZ_240312,[]
|
166 |
+
claude_1,0.8839,alpacav1,BLZ_240312,[]
|
167 |
+
claude_2.0,0.9136,alpacav1,BLZ_240312,[]
|
168 |
+
gemini_pro_dev_api,0.7966,alpacav1,BLZ_240312,[]
|
169 |
+
claude_2.1,0.8708,alpacav1,BLZ_240312,[]
|
170 |
+
gpt_3.5_turbo_0613,0.8937,alpacav1,BLZ_240312,[]
|
171 |
+
mixtral_8x7b_instruct_v0.1,0.9478,alpacav1,BLZ_240312,[]
|
172 |
+
yi_34b_chat,0.9408,alpacav1,BLZ_240312,[]
|
173 |
+
gemini_pro,0.7966,alpacav1,BLZ_240312,[]
|
174 |
+
gpt_3.5_turbo_0314,0.8937,alpacav1,BLZ_240312,[]
|
175 |
+
tulu_2_dpo_70b,0.9503,alpacav1,BLZ_240312,[]
|
176 |
+
vicuna_33b,0.8898999999999999,alpacav1,BLZ_240312,[]
|
177 |
+
starling_lm_7b_alpha,0.9198999999999999,alpacav1,BLZ_240312,[]
|
178 |
+
llama_2_70b_chat,0.9266,alpacav1,BLZ_240312,[]
|
179 |
+
openchat_3.5,0.8851,alpacav1,BLZ_240312,[]
|
180 |
+
gpt_3.5_turbo_1106,0.8626,alpacav1,BLZ_240312,[]
|
181 |
+
wizardlm_13b_v1.2,0.8917,alpacav1,BLZ_240312,[]
|
182 |
+
zephyr_7b_beta,0.9059999999999999,alpacav1,BLZ_240312,[]
|
183 |
+
llama_2_13b_chat,0.8109000000000001,alpacav1,BLZ_240312,[]
|
184 |
+
zephyr_7b_alpha,0.8576,alpacav1,BLZ_240312,[]
|
185 |
+
guanaco_33b,0.6596,alpacav1,BLZ_240312,[]
|
186 |
+
llama_2_7b_chat,0.7137,alpacav1,BLZ_240312,[]
|
187 |
+
chatglm2_6b,0.47130000000000005,alpacav1,BLZ_240312,[]
|
188 |
+
openassistant_pythia_12b,0.2596,alpacav1,BLZ_240312,[]
|
189 |
+
gpt_4_1106_preview,0.5,alpacav2,BLZ_240312,[]
|
190 |
+
gpt_4_0314,0.221,alpacav2,BLZ_240312,[]
|
191 |
+
gpt_4_0613,0.158,alpacav2,BLZ_240312,[]
|
192 |
+
mistral_medium,0.21899999999999997,alpacav2,BLZ_240312,[]
|
193 |
+
claude_1,0.17,alpacav2,BLZ_240312,[]
|
194 |
+
claude_2.0,0.172,alpacav2,BLZ_240312,[]
|
195 |
+
gemini_pro_dev_api,0.16899999999999998,alpacav2,BLZ_240312,[]
|
196 |
+
claude_2.1,0.157,alpacav2,BLZ_240312,[]
|
197 |
+
gpt_3.5_turbo_0613,0.141,alpacav2,BLZ_240312,[]
|
198 |
+
mixtral_8x7b_instruct_v0.1,0.183,alpacav2,BLZ_240312,[]
|
199 |
+
yi_34b_chat,0.297,alpacav2,BLZ_240312,[]
|
200 |
+
gemini_pro,0.16899999999999998,alpacav2,BLZ_240312,[]
|
201 |
+
claude_instant_1,0.161,alpacav2,BLZ_240312,[]
|
202 |
+
gpt_3.5_turbo_0314,0.096,alpacav2,BLZ_240312,[]
|
203 |
+
wizardlm_70b_v1.0,0.14400000000000002,alpacav2,BLZ_240312,[]
|
204 |
+
tulu_2_dpo_70b,0.16,alpacav2,BLZ_240312,[]
|
205 |
+
vicuna_33b,0.127,alpacav2,BLZ_240312,[]
|
206 |
+
starling_lm_7b_alpha,0.142,alpacav2,BLZ_240312,[]
|
207 |
+
deepseek_llm_67b_chat,0.121,alpacav2,BLZ_240312,[]
|
208 |
+
llama_2_70b_chat,0.139,alpacav2,BLZ_240312,[]
|
209 |
+
openhermes_2.5_mistral_7b,0.10300000000000001,alpacav2,BLZ_240312,[]
|
210 |
+
gpt_3.5_turbo_1106,0.092,alpacav2,BLZ_240312,[]
|
211 |
+
dolphin_2.2.1_mistral_7b,0.09,alpacav2,BLZ_240312,[]
|
212 |
+
wizardlm_13b_v1.2,0.12,alpacav2,BLZ_240312,[]
|
213 |
+
zephyr_7b_beta,0.11,alpacav2,BLZ_240312,[]
|
214 |
+
llama_2_13b_chat,0.077,alpacav2,BLZ_240312,[]
|
215 |
+
vicuna_13b,0.067,alpacav2,BLZ_240312,[]
|
216 |
+
zephyr_7b_alpha,0.084,alpacav2,BLZ_240312,[]
|
217 |
+
qwen_14b_chat,0.075,alpacav2,BLZ_240312,[]
|
218 |
+
guanaco_33b,0.05,alpacav2,BLZ_240312,[]
|
219 |
+
llama_2_7b_chat,0.0496,alpacav2,BLZ_240312,[]
|
220 |
+
vicuna_7b,0.048,alpacav2,BLZ_240312,[]
|
221 |
+
chatglm2_6b,0.027999999999999997,alpacav2,BLZ_240312,[]
|
222 |
+
openassistant_pythia_12b,0.018000000000000002,alpacav2,BLZ_240312,[]
|
223 |
+
gpt_4_1106_preview,0.32799999999999996,alpacaeval2_lc,BLZ_240312,[]
|
224 |
+
gpt_4_0314,0.21600000000000003,alpacaeval2_lc,BLZ_240312,[]
|
225 |
+
gpt_4_0613,0.18600000000000003,alpacaeval2_lc,BLZ_240312,[]
|
226 |
+
mistral_medium,0.196,alpacaeval2_lc,BLZ_240312,[]
|
227 |
+
claude_1,0.21100000000000002,alpacaeval2_lc,BLZ_240312,[]
|
228 |
+
claude_2.0,0.21600000000000003,alpacaeval2_lc,BLZ_240312,[]
|
229 |
+
gemini_pro_dev_api,0.172,alpacaeval2_lc,BLZ_240312,[]
|
230 |
+
claude_2.1,0.193,alpacaeval2_lc,BLZ_240312,[]
|
231 |
+
gpt_3.5_turbo_0613,0.14300000000000002,alpacaeval2_lc,BLZ_240312,[]
|
232 |
+
mixtral_8x7b_instruct_v0.1,0.168,alpacaeval2_lc,BLZ_240312,[]
|
233 |
+
yi_34b_chat,0.188,alpacaeval2_lc,BLZ_240312,[]
|
234 |
+
claude_instant_1,0.195,alpacaeval2_lc,BLZ_240312,[]
|
235 |
+
gpt_3.5_turbo_0314,0.156,alpacaeval2_lc,BLZ_240312,[]
|
236 |
+
wizardlm_70b_v1.0,0.125,alpacaeval2_lc,BLZ_240312,[]
|
237 |
+
tulu_2_dpo_70b,0.151,alpacaeval2_lc,BLZ_240312,[]
|
238 |
+
vicuna_33b,0.115,alpacaeval2_lc,BLZ_240312,[]
|
239 |
+
starling_lm_7b_alpha,0.10099999999999999,alpacaeval2_lc,BLZ_240312,[]
|
240 |
+
deepseek_llm_67b_chat,0.141,alpacaeval2_lc,BLZ_240312,[]
|
241 |
+
llama_2_70b_chat,0.10400000000000001,alpacaeval2_lc,BLZ_240312,[]
|
242 |
+
openhermes_2.5_mistral_7b,0.126,alpacaeval2_lc,BLZ_240312,[]
|
243 |
+
gpt_3.5_turbo_1106,0.155,alpacaeval2_lc,BLZ_240312,[]
|
244 |
+
dolphin_2.2.1_mistral_7b,0.10800000000000001,alpacaeval2_lc,BLZ_240312,[]
|
245 |
+
wizardlm_13b_v1.2,0.099,alpacaeval2_lc,BLZ_240312,[]
|
246 |
+
zephyr_7b_beta,0.102,alpacaeval2_lc,BLZ_240312,[]
|
247 |
+
llama_2_13b_chat,0.068,alpacaeval2_lc,BLZ_240312,[]
|
248 |
+
vicuna_13b,0.085,alpacaeval2_lc,BLZ_240312,[]
|
249 |
+
zephyr_7b_alpha,0.086,alpacaeval2_lc,BLZ_240312,[]
|
250 |
+
qwen_14b_chat,0.1,alpacaeval2_lc,BLZ_240312,[]
|
251 |
+
llama_2_7b_chat,0.045,alpacaeval2_lc,BLZ_240312,[]
|
252 |
+
vicuna_7b,0.06,alpacaeval2_lc,BLZ_240312,[]
|
253 |
+
gpt_4_0125_preview,1.0,arena_elo,BLZ_240312,[]
|
254 |
+
gpt_4_1106_preview,0.9992019154030327,arena_elo,BLZ_240312,[]
|
255 |
+
bard_gemini_pro,0.9768555466879489,arena_elo,BLZ_240312,[]
|
256 |
+
gpt_4_0314,0.9497206703910615,arena_elo,BLZ_240312,[]
|
257 |
+
gpt_4_0613,0.9273743016759777,arena_elo,BLZ_240312,[]
|
258 |
+
mistral_medium,0.9177972865123704,arena_elo,BLZ_240312,[]
|
259 |
+
claude_1,0.9169992019154031,arena_elo,BLZ_240312,[]
|
260 |
+
claude_2.0,0.9034317637669593,arena_elo,BLZ_240312,[]
|
261 |
+
gemini_pro_dev_api,0.8938547486033519,arena_elo,BLZ_240312,[]
|
262 |
+
claude_2.1,0.8930566640063847,arena_elo,BLZ_240312,[]
|
263 |
+
gpt_3.5_turbo_0613,0.8922585794094174,arena_elo,BLZ_240312,[]
|
264 |
+
mixtral_8x7b_instruct_v0.1,0.8922585794094174,arena_elo,BLZ_240312,[]
|
265 |
+
yi_34b_chat,0.8898643256185156,arena_elo,BLZ_240312,[]
|
266 |
+
gemini_pro,0.8890662410215483,arena_elo,BLZ_240312,[]
|
267 |
+
claude_instant_1,0.8850758180367119,arena_elo,BLZ_240312,[]
|
268 |
+
gpt_3.5_turbo_0314,0.8818834796488427,arena_elo,BLZ_240312,[]
|
269 |
+
wizardlm_70b_v1.0,0.8818834796488427,arena_elo,BLZ_240312,[]
|
270 |
+
tulu_2_dpo_70b,0.8810853950518756,arena_elo,BLZ_240312,[]
|
271 |
+
vicuna_33b,0.8723064644852354,arena_elo,BLZ_240312,[]
|
272 |
+
starling_lm_7b_alpha,0.8699122106943336,arena_elo,BLZ_240312,[]
|
273 |
+
deepseek_llm_67b_chat,0.8635275339185954,arena_elo,BLZ_240312,[]
|
274 |
+
llama_2_70b_chat,0.8635275339185954,arena_elo,BLZ_240312,[]
|
275 |
+
nv_llama2_70b_steerlm_chat,0.8603351955307262,arena_elo,BLZ_240312,[]
|
276 |
+
openhermes_2.5_mistral_7b,0.8603351955307262,arena_elo,BLZ_240312,[]
|
277 |
+
openchat_3.5,0.8587390263367917,arena_elo,BLZ_240312,[]
|
278 |
+
pplx_70b_online,0.8587390263367917,arena_elo,BLZ_240312,[]
|
279 |
+
gpt_3.5_turbo_1106,0.8547486033519553,arena_elo,BLZ_240312,[]
|
280 |
+
solar_10.7b_instruct_v1.0,0.8499600957701516,arena_elo,BLZ_240312,[]
|
281 |
+
dolphin_2.2.1_mistral_7b,0.8499600957701516,arena_elo,BLZ_240312,[]
|
282 |
+
wizardlm_13b_v1.2,0.8443735035913806,arena_elo,BLZ_240312,[]
|
283 |
+
zephyr_7b_beta,0.8387869114126097,arena_elo,BLZ_240312,[]
|
284 |
+
mpt_30b_chat,0.8332003192338387,arena_elo,BLZ_240312,[]
|
285 |
+
codellama_34b_instruct,0.8324022346368715,arena_elo,BLZ_240312,[]
|
286 |
+
llama_2_13b_chat,0.8316041500399042,arena_elo,BLZ_240312,[]
|
287 |
+
vicuna_13b,0.8300079808459697,arena_elo,BLZ_240312,[]
|
288 |
+
pplx_7b_online,0.8284118116520351,arena_elo,BLZ_240312,[]
|
289 |
+
zephyr_7b_alpha,0.8276137270550679,arena_elo,BLZ_240312,[]
|
290 |
+
qwen_14b_chat,0.825219473264166,arena_elo,BLZ_240312,[]
|
291 |
+
falcon_180b_chat,0.8236233040702314,arena_elo,BLZ_240312,[]
|
292 |
+
guanaco_33b,0.8236233040702314,arena_elo,BLZ_240312,[]
|
293 |
+
llama_2_7b_chat,0.8172386272944933,arena_elo,BLZ_240312,[]
|
294 |
+
stripedhyena_nous_7b,0.8140462889066241,arena_elo,BLZ_240312,[]
|
295 |
+
mistral_7b_instruct_v0.1,0.8028731045490822,arena_elo,BLZ_240312,[]
|
296 |
+
palm_chat_bison_001,0.8028731045490822,arena_elo,BLZ_240312,[]
|
297 |
+
vicuna_7b,0.8020750199521149,arena_elo,BLZ_240312,[]
|
298 |
+
koala_13b,0.770949720670391,arena_elo,BLZ_240312,[]
|
299 |
+
chatglm3_6b,0.7661612130885874,arena_elo,BLZ_240312,[]
|
300 |
+
gpt4all_13b_snoozy,0.74780526735834,arena_elo,BLZ_240312,[]
|
301 |
+
mpt_7b_chat,0.7430167597765364,arena_elo,BLZ_240312,[]
|
302 |
+
chatglm2_6b,0.7422186751795691,arena_elo,BLZ_240312,[]
|
303 |
+
rwkv_4_raven_14b,0.7382282521947326,arena_elo,BLZ_240312,[]
|
304 |
+
alpaca_13b,0.7214684756584198,arena_elo,BLZ_240312,[]
|
305 |
+
openassistant_pythia_12b,0.7158818834796489,arena_elo,BLZ_240312,[]
|
306 |
+
chatglm_6b,0.704708699122107,arena_elo,BLZ_240312,[]
|
307 |
+
fastchat_t5_3b,0.6975259377494014,arena_elo,BLZ_240312,[]
|
308 |
+
stablelm_tuned_alpha_7b,0.6743814844373504,arena_elo,BLZ_240312,[]
|
309 |
+
dolly_v2_12b,0.6568236233040702,arena_elo,BLZ_240312,[]
|
310 |
+
llama_13b,0.6384676775738228,arena_elo,BLZ_240312,[]
|
311 |
+
gpt_4_1106_preview,0.8390000000000001,bbh,BLZ_240312,[]
|
312 |
+
gpt_4_0314,0.867,bbh,BLZ_240312,[]
|
313 |
+
gpt_4_0613,0.867,bbh,BLZ_240312,[]
|
314 |
+
claude_1,0.6729999999999999,bbh,BLZ_240312,[]
|
315 |
+
gemini_pro_dev_api,0.6559999999999999,bbh,BLZ_240312,[]
|
316 |
+
gpt_3.5_turbo_0613,0.71,bbh,BLZ_240312,[]
|
317 |
+
mixtral_8x7b_instruct_v0.1,0.67,bbh,BLZ_240312,[]
|
318 |
+
yi_34b_chat,0.7170000000000001,bbh,BLZ_240312,[]
|
319 |
+
gemini_pro,0.6559999999999999,bbh,BLZ_240312,[]
|
320 |
+
tulu_2_dpo_70b,0.66,bbh,BLZ_240312,[]
|
321 |
+
vicuna_33b,0.52,bbh,BLZ_240312,[]
|
322 |
+
llama_2_70b_chat,0.608,bbh,BLZ_240312,[]
|
323 |
+
gpt_3.5_turbo_1106,0.71,bbh,BLZ_240312,[]
|
324 |
+
dolphin_2.2.1_mistral_7b,0.598,bbh,BLZ_240312,[]
|
325 |
+
llama_2_13b_chat,0.5820000000000001,bbh,BLZ_240312,[]
|
326 |
+
vicuna_13b,0.515,bbh,BLZ_240312,[]
|
327 |
+
qwen_14b_chat,0.537,bbh,BLZ_240312,[]
|
328 |
+
llama_2_7b_chat,0.35600000000000004,bbh,BLZ_240312,[]
|
329 |
+
mistral_7b_instruct_v0.1,0.5670000000000001,bbh,BLZ_240312,[]
|
330 |
+
vicuna_7b,0.434,bbh,BLZ_240312,[]
|
331 |
+
llama_13b,0.379,bbh,BLZ_240312,[]
|
332 |
+
gpt_4_1106_preview,0.8604999999999999,eq_benchv2,BLZ_240312,[]
|
333 |
+
gpt_4_0314,0.8573000000000001,eq_benchv2,BLZ_240312,[]
|
334 |
+
gpt_4_0613,0.8479000000000001,eq_benchv2,BLZ_240312,[]
|
335 |
+
mistral_medium,0.8256999999999999,eq_benchv2,BLZ_240312,[]
|
336 |
+
claude_1,0.7683,eq_benchv2,BLZ_240312,[]
|
337 |
+
claude_2.0,0.7289,eq_benchv2,BLZ_240312,[]
|
338 |
+
gemini_pro_dev_api,0.7508,eq_benchv2,BLZ_240312,[]
|
339 |
+
claude_2.1,0.7395999999999999,eq_benchv2,BLZ_240312,[]
|
340 |
+
gpt_3.5_turbo_0613,0.6934999999999999,eq_benchv2,BLZ_240312,[]
|
341 |
+
mixtral_8x7b_instruct_v0.1,0.7237,eq_benchv2,BLZ_240312,[]
|
342 |
+
yi_34b_chat,0.7162000000000001,eq_benchv2,BLZ_240312,[]
|
343 |
+
claude_instant_1,0.6904,eq_benchv2,BLZ_240312,[]
|
344 |
+
gpt_3.5_turbo_0314,0.7067,eq_benchv2,BLZ_240312,[]
|
345 |
+
wizardlm_70b_v1.0,0.7128,eq_benchv2,BLZ_240312,[]
|
346 |
+
tulu_2_dpo_70b,0.7663,eq_benchv2,BLZ_240312,[]
|
347 |
+
vicuna_33b,0.6707,eq_benchv2,BLZ_240312,[]
|
348 |
+
starling_lm_7b_alpha,0.7390000000000001,eq_benchv2,BLZ_240312,[]
|
349 |
+
deepseek_llm_67b_chat,0.7753,eq_benchv2,BLZ_240312,[]
|
350 |
+
llama_2_70b_chat,0.7359,eq_benchv2,BLZ_240312,[]
|
351 |
+
openhermes_2.5_mistral_7b,0.6689,eq_benchv2,BLZ_240312,[]
|
352 |
+
openchat_3.5,0.7218000000000001,eq_benchv2,BLZ_240312,[]
|
353 |
+
pplx_70b_online,0.6279,eq_benchv2,BLZ_240312,[]
|
354 |
+
gpt_3.5_turbo_1106,0.7173999999999999,eq_benchv2,BLZ_240312,[]
|
355 |
+
solar_10.7b_instruct_v1.0,0.7353000000000001,eq_benchv2,BLZ_240312,[]
|
356 |
+
dolphin_2.2.1_mistral_7b,0.6992,eq_benchv2,BLZ_240312,[]
|
357 |
+
wizardlm_13b_v1.2,0.6371,eq_benchv2,BLZ_240312,[]
|
358 |
+
zephyr_7b_beta,0.5832999999999999,eq_benchv2,BLZ_240312,[]
|
359 |
+
codellama_34b_instruct,0.4915,eq_benchv2,BLZ_240312,[]
|
360 |
+
llama_2_13b_chat,0.49119999999999997,eq_benchv2,BLZ_240312,[]
|
361 |
+
vicuna_13b,0.6739,eq_benchv2,BLZ_240312,[]
|
362 |
+
pplx_7b_online,0.4891,eq_benchv2,BLZ_240312,[]
|
363 |
+
zephyr_7b_alpha,0.5682,eq_benchv2,BLZ_240312,[]
|
364 |
+
qwen_14b_chat,0.6347,eq_benchv2,BLZ_240312,[]
|
365 |
+
falcon_180b_chat,0.5682,eq_benchv2,BLZ_240312,[]
|
366 |
+
guanaco_33b,0.3611,eq_benchv2,BLZ_240312,[]
|
367 |
+
llama_2_7b_chat,0.3632,eq_benchv2,BLZ_240312,[]
|
368 |
+
stripedhyena_nous_7b,0.5458,eq_benchv2,BLZ_240312,[]
|
369 |
+
mistral_7b_instruct_v0.1,0.5215,eq_benchv2,BLZ_240312,[]
|
370 |
+
yi_34bx2_moe_60b,0.7269,eq_benchv2,BLZ_240312,[]
|
371 |
+
mixtral_8x7b_instruct_v0.1,0.7641,gpt4all,BLZ_240312,[]
|
372 |
+
yi_34b_chat,0.7212999999999999,gpt4all,BLZ_240312,[]
|
373 |
+
starling_lm_7b_alpha,0.7272,gpt4all,BLZ_240312,[]
|
374 |
+
openhermes_2.5_mistral_7b,0.7312000000000001,gpt4all,BLZ_240312,[]
|
375 |
+
openchat_3.5,0.7292000000000001,gpt4all,BLZ_240312,[]
|
376 |
+
solar_10.7b_instruct_v1.0,0.7511,gpt4all,BLZ_240312,[]
|
377 |
+
dolphin_2.2.1_mistral_7b,0.7223999999999999,gpt4all,BLZ_240312,[]
|
378 |
+
zephyr_7b_beta,0.7182999999999999,gpt4all,BLZ_240312,[]
|
379 |
+
vicuna_13b,0.631,gpt4all,BLZ_240312,[]
|
380 |
+
zephyr_7b_alpha,0.7223999999999999,gpt4all,BLZ_240312,[]
|
381 |
+
mistral_7b_instruct_v0.1,0.6795,gpt4all,BLZ_240312,[]
|
382 |
+
vicuna_7b,0.61,gpt4all,BLZ_240312,[]
|
383 |
+
koala_13b,0.62,gpt4all,BLZ_240312,[]
|
384 |
+
gpt4all_13b_snoozy,0.653,gpt4all,BLZ_240312,[]
|
385 |
+
mpt_7b_chat,0.648,gpt4all,BLZ_240312,[]
|
386 |
+
openassistant_pythia_12b,0.61,gpt4all,BLZ_240312,[]
|
387 |
+
fastchat_t5_3b,0.537,gpt4all,BLZ_240312,[]
|
388 |
+
stablelm_tuned_alpha_7b,0.513,gpt4all,BLZ_240312,[]
|
389 |
+
llama_13b,0.63,gpt4all,BLZ_240312,[]
|
390 |
+
mixtral_8x7b_instruct_v0.1,0.7262000000000001,hugging_6,BLZ_240312,[]
|
391 |
+
yi_34b_chat,0.6531999999999999,hugging_6,BLZ_240312,[]
|
392 |
+
wizardlm_70b_v1.0,0.6125,hugging_6,BLZ_240312,[]
|
393 |
+
tulu_2_dpo_70b,0.7376999999999999,hugging_6,BLZ_240312,[]
|
394 |
+
vicuna_33b,0.585,hugging_6,BLZ_240312,[]
|
395 |
+
starling_lm_7b_alpha,0.6713,hugging_6,BLZ_240312,[]
|
396 |
+
llama_2_70b_chat,0.624,hugging_6,BLZ_240312,[]
|
397 |
+
openhermes_2.5_mistral_7b,0.6152000000000001,hugging_6,BLZ_240312,[]
|
398 |
+
openchat_3.5,0.6124,hugging_6,BLZ_240312,[]
|
399 |
+
solar_10.7b_instruct_v1.0,0.742,hugging_6,BLZ_240312,[]
|
400 |
+
dolphin_2.2.1_mistral_7b,0.6493000000000001,hugging_6,BLZ_240312,[]
|
401 |
+
wizardlm_13b_v1.2,0.5476,hugging_6,BLZ_240312,[]
|
402 |
+
zephyr_7b_beta,0.6195,hugging_6,BLZ_240312,[]
|
403 |
+
mpt_30b_chat,0.5538000000000001,hugging_6,BLZ_240312,[]
|
404 |
+
codellama_34b_instruct,0.5729,hugging_6,BLZ_240312,[]
|
405 |
+
llama_2_13b_chat,0.5490999999999999,hugging_6,BLZ_240312,[]
|
406 |
+
vicuna_13b,0.5539999999999999,hugging_6,BLZ_240312,[]
|
407 |
+
zephyr_7b_alpha,0.595,hugging_6,BLZ_240312,[]
|
408 |
+
falcon_180b_chat,0.6785,hugging_6,BLZ_240312,[]
|
409 |
+
llama_2_7b_chat,0.5074000000000001,hugging_6,BLZ_240312,[]
|
410 |
+
mistral_7b_instruct_v0.1,0.5496,hugging_6,BLZ_240312,[]
|
411 |
+
vicuna_7b,0.521,hugging_6,BLZ_240312,[]
|
412 |
+
yi_34bx2_moe_60b,0.7672,hugging_6,BLZ_240312,[]
|
413 |
+
gpt_4_0314,0.93,llmonitor,BLZ_240312,[]
|
414 |
+
gpt_4_0613,0.89,llmonitor,BLZ_240312,[]
|
415 |
+
claude_1,0.66,llmonitor,BLZ_240312,[]
|
416 |
+
claude_2.0,0.68,llmonitor,BLZ_240312,[]
|
417 |
+
gpt_3.5_turbo_0613,0.81,llmonitor,BLZ_240312,[]
|
418 |
+
claude_instant_1,0.6,llmonitor,BLZ_240312,[]
|
419 |
+
gpt_3.5_turbo_0314,0.79,llmonitor,BLZ_240312,[]
|
420 |
+
llama_2_70b_chat,0.6,llmonitor,BLZ_240312,[]
|
421 |
+
mpt_30b_chat,0.4,llmonitor,BLZ_240312,[]
|
422 |
+
codellama_34b_instruct,0.34,llmonitor,BLZ_240312,[]
|
423 |
+
llama_2_13b_chat,0.5,llmonitor,BLZ_240312,[]
|
424 |
+
vicuna_13b,0.5,llmonitor,BLZ_240312,[]
|
425 |
+
falcon_180b_chat,0.67,llmonitor,BLZ_240312,[]
|
426 |
+
guanaco_33b,0.43,llmonitor,BLZ_240312,[]
|
427 |
+
llama_2_7b_chat,0.5,llmonitor,BLZ_240312,[]
|
428 |
+
mistral_7b_instruct_v0.1,0.57,llmonitor,BLZ_240312,[]
|
429 |
+
palm_chat_bison_001,0.57,llmonitor,BLZ_240312,[]
|
430 |
+
vicuna_7b,0.41,llmonitor,BLZ_240312,[]
|
431 |
+
koala_13b,0.31,llmonitor,BLZ_240312,[]
|
432 |
+
mpt_7b_chat,0.43,llmonitor,BLZ_240312,[]
|
433 |
+
dolly_v2_12b,0.23,llmonitor,BLZ_240312,[]
|
434 |
+
mistral_medium,0.654,magi,BLZ_240312,[]
|
435 |
+
gemini_pro_dev_api,0.528,magi,BLZ_240312,[]
|
436 |
+
gpt_3.5_turbo_0613,0.455,magi,BLZ_240312,[]
|
437 |
+
mixtral_8x7b_instruct_v0.1,0.49560000000000004,magi,BLZ_240312,[]
|
438 |
+
yi_34b_chat,0.5821999999999999,magi,BLZ_240312,[]
|
439 |
+
gpt_3.5_turbo_0314,0.512,magi,BLZ_240312,[]
|
440 |
+
wizardlm_70b_v1.0,0.4476,magi,BLZ_240312,[]
|
441 |
+
tulu_2_dpo_70b,0.5212,magi,BLZ_240312,[]
|
442 |
+
vicuna_33b,0.3837,magi,BLZ_240312,[]
|
443 |
+
starling_lm_7b_alpha,0.4304,magi,BLZ_240312,[]
|
444 |
+
deepseek_llm_67b_chat,0.5946,magi,BLZ_240312,[]
|
445 |
+
llama_2_70b_chat,0.39899999999999997,magi,BLZ_240312,[]
|
446 |
+
openhermes_2.5_mistral_7b,0.4236,magi,BLZ_240312,[]
|
447 |
+
openchat_3.5,0.42200000000000004,magi,BLZ_240312,[]
|
448 |
+
gpt_3.5_turbo_1106,0.462,magi,BLZ_240312,[]
|
449 |
+
solar_10.7b_instruct_v1.0,0.4693,magi,BLZ_240312,[]
|
450 |
+
dolphin_2.2.1_mistral_7b,0.3782,magi,BLZ_240312,[]
|
451 |
+
wizardlm_13b_v1.2,0.3678,magi,BLZ_240312,[]
|
452 |
+
zephyr_7b_beta,0.4042,magi,BLZ_240312,[]
|
453 |
+
llama_2_13b_chat,0.37170000000000003,magi,BLZ_240312,[]
|
454 |
+
vicuna_13b,0.36560000000000004,magi,BLZ_240312,[]
|
455 |
+
zephyr_7b_alpha,0.39899999999999997,magi,BLZ_240312,[]
|
456 |
+
qwen_14b_chat,0.4535,magi,BLZ_240312,[]
|
457 |
+
guanaco_33b,0.38659999999999994,magi,BLZ_240312,[]
|
458 |
+
llama_2_7b_chat,0.35969999999999996,magi,BLZ_240312,[]
|
459 |
+
mistral_7b_instruct_v0.1,0.3704,magi,BLZ_240312,[]
|
460 |
+
gpt_4_1106_preview,0.805,mmlu,BLZ_240312,[]
|
461 |
+
gpt_4_0314,0.8640000000000001,mmlu,BLZ_240312,[]
|
462 |
+
mistral_medium,0.753,mmlu,BLZ_240312,[]
|
463 |
+
claude_1,0.77,mmlu,BLZ_240312,[]
|
464 |
+
claude_2.0,0.785,mmlu,BLZ_240312,[]
|
465 |
+
gemini_pro_dev_api,0.718,mmlu,BLZ_240312,[]
|
466 |
+
mixtral_8x7b_instruct_v0.1,0.706,mmlu,BLZ_240312,[]
|
467 |
+
yi_34b_chat,0.735,mmlu,BLZ_240312,[]
|
468 |
+
gemini_pro,0.718,mmlu,BLZ_240312,[]
|
469 |
+
claude_instant_1,0.7340000000000001,mmlu,BLZ_240312,[]
|
470 |
+
gpt_3.5_turbo_0314,0.7,mmlu,BLZ_240312,[]
|
471 |
+
wizardlm_70b_v1.0,0.637,mmlu,BLZ_240312,[]
|
472 |
+
tulu_2_dpo_70b,0.698,mmlu,BLZ_240312,[]
|
473 |
+
vicuna_33b,0.5920000000000001,mmlu,BLZ_240312,[]
|
474 |
+
starling_lm_7b_alpha,0.639,mmlu,BLZ_240312,[]
|
475 |
+
deepseek_llm_67b_chat,0.713,mmlu,BLZ_240312,[]
|
476 |
+
llama_2_70b_chat,0.63,mmlu,BLZ_240312,[]
|
477 |
+
nv_llama2_70b_steerlm_chat,0.685,mmlu,BLZ_240312,[]
|
478 |
+
openhermes_2.5_mistral_7b,0.638,mmlu,BLZ_240312,[]
|
479 |
+
openchat_3.5,0.643,mmlu,BLZ_240312,[]
|
480 |
+
gpt_3.5_turbo_1106,0.6779999999999999,mmlu,BLZ_240312,[]
|
481 |
+
solar_10.7b_instruct_v1.0,0.662,mmlu,BLZ_240312,[]
|
482 |
+
dolphin_2.2.1_mistral_7b,0.632,mmlu,BLZ_240312,[]
|
483 |
+
wizardlm_13b_v1.2,0.527,mmlu,BLZ_240312,[]
|
484 |
+
zephyr_7b_beta,0.614,mmlu,BLZ_240312,[]
|
485 |
+
mpt_30b_chat,0.504,mmlu,BLZ_240312,[]
|
486 |
+
codellama_34b_instruct,0.537,mmlu,BLZ_240312,[]
|
487 |
+
llama_2_13b_chat,0.536,mmlu,BLZ_240312,[]
|
488 |
+
vicuna_13b,0.5579999999999999,mmlu,BLZ_240312,[]
|
489 |
+
zephyr_7b_alpha,0.614,mmlu,BLZ_240312,[]
|
490 |
+
qwen_14b_chat,0.665,mmlu,BLZ_240312,[]
|
491 |
+
falcon_180b_chat,0.68,mmlu,BLZ_240312,[]
|
492 |
+
guanaco_33b,0.5760000000000001,mmlu,BLZ_240312,[]
|
493 |
+
llama_2_7b_chat,0.45799999999999996,mmlu,BLZ_240312,[]
|
494 |
+
mistral_7b_instruct_v0.1,0.5539999999999999,mmlu,BLZ_240312,[]
|
495 |
+
vicuna_7b,0.51,mmlu,BLZ_240312,[]
|
496 |
+
koala_13b,0.447,mmlu,BLZ_240312,[]
|
497 |
+
gpt4all_13b_snoozy,0.43,mmlu,BLZ_240312,[]
|
498 |
+
mpt_7b_chat,0.32,mmlu,BLZ_240312,[]
|
499 |
+
chatglm2_6b,0.455,mmlu,BLZ_240312,[]
|
500 |
+
rwkv_4_raven_14b,0.256,mmlu,BLZ_240312,[]
|
501 |
+
alpaca_13b,0.48100000000000004,mmlu,BLZ_240312,[]
|
502 |
+
openassistant_pythia_12b,0.27,mmlu,BLZ_240312,[]
|
503 |
+
chatglm_6b,0.361,mmlu,BLZ_240312,[]
|
504 |
+
fastchat_t5_3b,0.47700000000000004,mmlu,BLZ_240312,[]
|
505 |
+
stablelm_tuned_alpha_7b,0.244,mmlu,BLZ_240312,[]
|
506 |
+
dolly_v2_12b,0.257,mmlu,BLZ_240312,[]
|
507 |
+
llama_13b,0.47,mmlu,BLZ_240312,[]
|
508 |
+
yi_34bx2_moe_60b,0.775,mmlu,BLZ_240312,[]
|
509 |
+
gpt_4_0125_preview,0.0929,mt_bench,BLZ_240312,[]
|
510 |
+
gpt_4_1106_preview,0.0932,mt_bench,BLZ_240312,[]
|
511 |
+
gpt_4_0314,0.08960000000000001,mt_bench,BLZ_240312,[]
|
512 |
+
gpt_4_0613,0.09179999999999999,mt_bench,BLZ_240312,[]
|
513 |
+
mistral_medium,0.0861,mt_bench,BLZ_240312,[]
|
514 |
+
claude_1,0.079,mt_bench,BLZ_240312,[]
|
515 |
+
claude_2.0,0.0806,mt_bench,BLZ_240312,[]
|
516 |
+
gemini_pro_dev_api,0.08039999999999999,mt_bench,BLZ_240312,[]
|
517 |
+
claude_2.1,0.0818,mt_bench,BLZ_240312,[]
|
518 |
+
gpt_3.5_turbo_0613,0.0839,mt_bench,BLZ_240312,[]
|
519 |
+
mixtral_8x7b_instruct_v0.1,0.083,mt_bench,BLZ_240312,[]
|
520 |
+
yi_34b_chat,0.07769999999999999,mt_bench,BLZ_240312,[]
|
521 |
+
gemini_pro,0.08039999999999999,mt_bench,BLZ_240312,[]
|
522 |
+
claude_instant_1,0.0785,mt_bench,BLZ_240312,[]
|
523 |
+
gpt_3.5_turbo_0314,0.0794,mt_bench,BLZ_240312,[]
|
524 |
+
wizardlm_70b_v1.0,0.0771,mt_bench,BLZ_240312,[]
|
525 |
+
tulu_2_dpo_70b,0.0789,mt_bench,BLZ_240312,[]
|
526 |
+
vicuna_33b,0.0712,mt_bench,BLZ_240312,[]
|
527 |
+
starling_lm_7b_alpha,0.0809,mt_bench,BLZ_240312,[]
|
528 |
+
deepseek_llm_67b_chat,0.08529999999999999,mt_bench,BLZ_240312,[]
|
529 |
+
llama_2_70b_chat,0.06860000000000001,mt_bench,BLZ_240312,[]
|
530 |
+
nv_llama2_70b_steerlm_chat,0.0754,mt_bench,BLZ_240312,[]
|
531 |
+
openhermes_2.5_mistral_7b,0.07690000000000001,mt_bench,BLZ_240312,[]
|
532 |
+
openchat_3.5,0.0781,mt_bench,BLZ_240312,[]
|
533 |
+
pplx_70b_online,0.0588,mt_bench,BLZ_240312,[]
|
534 |
+
gpt_3.5_turbo_1106,0.0832,mt_bench,BLZ_240312,[]
|
535 |
+
solar_10.7b_instruct_v1.0,0.0758,mt_bench,BLZ_240312,[]
|
536 |
+
wizardlm_13b_v1.2,0.07200000000000001,mt_bench,BLZ_240312,[]
|
537 |
+
zephyr_7b_beta,0.07339999999999999,mt_bench,BLZ_240312,[]
|
538 |
+
mpt_30b_chat,0.0639,mt_bench,BLZ_240312,[]
|
539 |
+
llama_2_13b_chat,0.0665,mt_bench,BLZ_240312,[]
|
540 |
+
vicuna_13b,0.06570000000000001,mt_bench,BLZ_240312,[]
|
541 |
+
zephyr_7b_alpha,0.0688,mt_bench,BLZ_240312,[]
|
542 |
+
qwen_14b_chat,0.0696,mt_bench,BLZ_240312,[]
|
543 |
+
guanaco_33b,0.0653,mt_bench,BLZ_240312,[]
|
544 |
+
llama_2_7b_chat,0.06269999999999999,mt_bench,BLZ_240312,[]
|
545 |
+
mistral_7b_instruct_v0.1,0.0684,mt_bench,BLZ_240312,[]
|
546 |
+
palm_chat_bison_001,0.064,mt_bench,BLZ_240312,[]
|
547 |
+
vicuna_7b,0.0617,mt_bench,BLZ_240312,[]
|
548 |
+
koala_13b,0.0535,mt_bench,BLZ_240312,[]
|
549 |
+
gpt4all_13b_snoozy,0.0541,mt_bench,BLZ_240312,[]
|
550 |
+
mpt_7b_chat,0.0542,mt_bench,BLZ_240312,[]
|
551 |
+
chatglm2_6b,0.0496,mt_bench,BLZ_240312,[]
|
552 |
+
rwkv_4_raven_14b,0.0398,mt_bench,BLZ_240312,[]
|
553 |
+
alpaca_13b,0.0453,mt_bench,BLZ_240312,[]
|
554 |
+
openassistant_pythia_12b,0.0432,mt_bench,BLZ_240312,[]
|
555 |
+
chatglm_6b,0.045,mt_bench,BLZ_240312,[]
|
556 |
+
fastchat_t5_3b,0.0304,mt_bench,BLZ_240312,[]
|
557 |
+
stablelm_tuned_alpha_7b,0.0275,mt_bench,BLZ_240312,[]
|
558 |
+
dolly_v2_12b,0.032799999999999996,mt_bench,BLZ_240312,[]
|
559 |
+
llama_13b,0.026099999999999998,mt_bench,BLZ_240312,[]
|
560 |
+
gpt_4_0613,0.735,mmlu,helm_lite_240610,[]
|
561 |
+
llama_3_70b,0.695,mmlu,helm_lite_240610,[]
|
562 |
+
mixtral_8x22b,0.701,mmlu,helm_lite_240610,[]
|
563 |
+
palmyra_x_v3_72b,0.702,mmlu,helm_lite_240610,[]
|
564 |
+
gpt_4_turbo_1106_preview,0.699,mmlu,helm_lite_240610,[]
|
565 |
+
palm_2_unicorn,0.702,mmlu,helm_lite_240610,[]
|
566 |
+
claude_3_opus_20240229,0.768,mmlu,helm_lite_240610,[]
|
567 |
+
qwen1.5_72b,0.647,mmlu,helm_lite_240610,[]
|
568 |
+
palmyra_x_v2_33b,0.621,mmlu,helm_lite_240610,[]
|
569 |
+
yi_34b,0.65,mmlu,helm_lite_240610,[]
|
570 |
+
qwen1.5_32b,0.628,mmlu,helm_lite_240610,[]
|
571 |
+
claude_v1.3,0.631,mmlu,helm_lite_240610,[]
|
572 |
+
mixtral_8x7b_32k_seqlen,0.649,mmlu,helm_lite_240610,[]
|
573 |
+
palm_2_bison,0.608,mmlu,helm_lite_240610,[]
|
574 |
+
claude_2.0,0.639,mmlu,helm_lite_240610,[]
|
575 |
+
deepseek_llm_chat_67b,0.641,mmlu,helm_lite_240610,[]
|
576 |
+
llama_2_70b,0.58,mmlu,helm_lite_240610,[]
|
577 |
+
claude_2.1,0.643,mmlu,helm_lite_240610,[]
|
578 |
+
gpt_3.5_text_davinci_003,0.555,mmlu,helm_lite_240610,[]
|
579 |
+
qwen1.5_14b,0.626,mmlu,helm_lite_240610,[]
|
580 |
+
claude_instant_1.2,0.631,mmlu,helm_lite_240610,[]
|
581 |
+
llama_3_8b,0.602,mmlu,helm_lite_240610,[]
|
582 |
+
gpt_3.5_turbo_0613,0.614,mmlu,helm_lite_240610,[]
|
583 |
+
gemma_7b,0.571,mmlu,helm_lite_240610,[]
|
584 |
+
claude_3_sonnet_20240229,0.652,mmlu,helm_lite_240610,[]
|
585 |
+
gpt_3.5_text_davinci_002,0.568,mmlu,helm_lite_240610,[]
|
586 |
+
llama_65b,0.584,mmlu,helm_lite_240610,[]
|
587 |
+
mistral_large_2402,0.638,mmlu,helm_lite_240610,[]
|
588 |
+
cohere_command,0.525,mmlu,helm_lite_240610,[]
|
589 |
+
dbrx_instructruct,0.643,mmlu,helm_lite_240610,[]
|
590 |
+
mistral_v0.1_7b,0.584,mmlu,helm_lite_240610,[]
|
591 |
+
mistral_small_2402,0.593,mmlu,helm_lite_240610,[]
|
592 |
+
mistral_medium_2312,0.618,mmlu,helm_lite_240610,[]
|
593 |
+
qwen1.5_7b,0.569,mmlu,helm_lite_240610,[]
|
594 |
+
claude_3_haiku_20240307,0.662,mmlu,helm_lite_240610,[]
|
595 |
+
yi_6b,0.53,mmlu,helm_lite_240610,[]
|
596 |
+
llama_2_13b,0.505,mmlu,helm_lite_240610,[]
|
597 |
+
jurassic_2_jumbo_178b,0.483,mmlu,helm_lite_240610,[]
|
598 |
+
falcon_40b,0.507,mmlu,helm_lite_240610,[]
|
599 |
+
phi_2,0.518,mmlu,helm_lite_240610,[]
|
600 |
+
jurassic_2_grande_17b,0.471,mmlu,helm_lite_240610,[]
|
601 |
+
llama_2_7b,0.425,mmlu,helm_lite_240610,[]
|
602 |
+
luminous_supreme_70b,0.316,mmlu,helm_lite_240610,[]
|
603 |
+
cohere_command_light,0.386,mmlu,helm_lite_240610,[]
|
604 |
+
luminous_extended_30b,0.248,mmlu,helm_lite_240610,[]
|
605 |
+
falcon_7b,0.288,mmlu,helm_lite_240610,[]
|
606 |
+
olmo_7b,0.305,mmlu,helm_lite_240610,[]
|
607 |
+
luminous_base_13b,0.243,mmlu,helm_lite_240610,[]
|
608 |
+
llama_2_70b,0.582,mmlu,helm_classic_240130,[]
|
609 |
+
llama_65b,0.584,mmlu,helm_classic_240130,[]
|
610 |
+
text_davinci_002,0.568,mmlu,helm_classic_240130,[]
|
611 |
+
mistral_v0.1_7b,0.572,mmlu,helm_classic_240130,[]
|
612 |
+
cohere_command_beta_52.4b,0.452,mmlu,helm_classic_240130,[]
|
613 |
+
text_davinci_003,0.569,mmlu,helm_classic_240130,[]
|
614 |
+
jurassic_2_jumbo_178b,0.48,mmlu,helm_classic_240130,[]
|
615 |
+
llama_2_13b,0.507,mmlu,helm_classic_240130,[]
|
616 |
+
tnlg_v2_530b,0.469,mmlu,helm_classic_240130,[]
|
617 |
+
gpt_3.5_turbo_0613,0.391,mmlu,helm_classic_240130,[]
|
618 |
+
llama_30b,0.531,mmlu,helm_classic_240130,[]
|
619 |
+
anthropic_lm_v4_s3_52b,0.481,mmlu,helm_classic_240130,[]
|
620 |
+
gpt_3.5_turbo_0301,0.59,mmlu,helm_classic_240130,[]
|
621 |
+
jurassic_2_grande_17b,0.475,mmlu,helm_classic_240130,[]
|
622 |
+
palmyra_x_43b,0.609,mmlu,helm_classic_240130,[]
|
623 |
+
falcon_40b,0.509,mmlu,helm_classic_240130,[]
|
624 |
+
falcon_instruct_40b,0.497,mmlu,helm_classic_240130,[]
|
625 |
+
mpt_instruct_30b,0.444,mmlu,helm_classic_240130,[]
|
626 |
+
mpt_30b,0.437,mmlu,helm_classic_240130,[]
|
627 |
+
j1_grande_v2_beta_17b,0.445,mmlu,helm_classic_240130,[]
|
628 |
+
vicuna_v1.3_13b,0.462,mmlu,helm_classic_240130,[]
|
629 |
+
cohere_command_beta_6.1b,0.406,mmlu,helm_classic_240130,[]
|
630 |
+
cohere_xlarge_v20221108_52.4b,0.382,mmlu,helm_classic_240130,[]
|
631 |
+
luminous_supreme_70b,0.38,mmlu,helm_classic_240130,[]
|
632 |
+
vicuna_v1.3_7b,0.434,mmlu,helm_classic_240130,[]
|
633 |
+
opt_175b,0.318,mmlu,helm_classic_240130,[]
|
634 |
+
llama_2_7b,0.431,mmlu,helm_classic_240130,[]
|
635 |
+
llama_13b,0.422,mmlu,helm_classic_240130,[]
|
636 |
+
instructpalmyra_30b,0.403,mmlu,helm_classic_240130,[]
|
637 |
+
cohere_xlarge_v20220609_52.4b,0.353,mmlu,helm_classic_240130,[]
|
638 |
+
jurassic_2_large_7.5b,0.339,mmlu,helm_classic_240130,[]
|
639 |
+
davinci_175b,0.422,mmlu,helm_classic_240130,[]
|
640 |
+
llama_7b,0.321,mmlu,helm_classic_240130,[]
|
641 |
+
redpajama_incite_instruct_7b,0.363,mmlu,helm_classic_240130,[]
|
642 |
+
j1_jumbo_v1_178b,0.259,mmlu,helm_classic_240130,[]
|
643 |
+
glm_130b,0.344,mmlu,helm_classic_240130,[]
|
644 |
+
luminous_extended_30b,0.321,mmlu,helm_classic_240130,[]
|
645 |
+
opt_66b,0.276,mmlu,helm_classic_240130,[]
|
646 |
+
bloom_176b,0.299,mmlu,helm_classic_240130,[]
|
647 |
+
j1_grande_v1_17b,0.27,mmlu,helm_classic_240130,[]
|
648 |
+
alpaca_7b,0.385,mmlu,helm_classic_240130,[]
|
649 |
+
falcon_7b,0.286,mmlu,helm_classic_240130,[]
|
650 |
+
redpajama_incite_base_7b,0.302,mmlu,helm_classic_240130,[]
|
651 |
+
cohere_large_v20220720_13.1b,0.324,mmlu,helm_classic_240130,[]
|
652 |
+
redpajama_incite_instruct_v1_3b,0.257,mmlu,helm_classic_240130,[]
|
653 |
+
text_curie_001,0.237,mmlu,helm_classic_240130,[]
|
654 |
+
gpt_neox_20b,0.276,mmlu,helm_classic_240130,[]
|
655 |
+
luminous_base_13b,0.27,mmlu,helm_classic_240130,[]
|
656 |
+
cohere_medium_v20221108_6.1b,0.254,mmlu,helm_classic_240130,[]
|
657 |
+
redpajama_incite_base_v1_3b,0.263,mmlu,helm_classic_240130,[]
|
658 |
+
tnlg_v2_6.7b,0.242,mmlu,helm_classic_240130,[]
|
659 |
+
j1_large_v1_7.5b,0.241,mmlu,helm_classic_240130,[]
|
660 |
+
gpt_j_6b,0.249,mmlu,helm_classic_240130,[]
|
661 |
+
pythia_12b,0.274,mmlu,helm_classic_240130,[]
|
662 |
+
curie_6.7b,0.243,mmlu,helm_classic_240130,[]
|
663 |
+
falcon_instruct_7b,0.275,mmlu,helm_classic_240130,[]
|
664 |
+
cohere_medium_v20220720_6.1b,0.279,mmlu,helm_classic_240130,[]
|
665 |
+
text_babbage_001,0.229,mmlu,helm_classic_240130,[]
|
666 |
+
t0pp_11b,0.407,mmlu,helm_classic_240130,[]
|
667 |
+
pythia_6.9b,0.236,mmlu,helm_classic_240130,[]
|
668 |
+
ul2_20b,0.291,mmlu,helm_classic_240130,[]
|
669 |
+
t5_11b,0.29,mmlu,helm_classic_240130,[]
|
670 |
+
babbage_1.3b,0.235,mmlu,helm_classic_240130,[]
|
671 |
+
cohere_small_v20220720_410m,0.264,mmlu,helm_classic_240130,[]
|
672 |
+
ada_350m,0.243,mmlu,helm_classic_240130,[]
|
673 |
+
text_ada_001,0.238,mmlu,helm_classic_240130,[]
|
674 |
+
yalm_100b,0.243,mmlu,helm_classic_240130,[]
|
675 |
+
aya_101,0.029411764705882353,biggen_mwr,biggen_240612,[]
|
676 |
+
c4ai_command_r_plus_gptq,0.8382352941176471,biggen_mwr,biggen_240612,[]
|
677 |
+
c4ai_command_r_v01,0.6948529411764706,biggen_mwr,biggen_240612,[]
|
678 |
+
claude_3_haiku_20240307,0.9252450980392157,biggen_mwr,biggen_240612,[]
|
679 |
+
claude_3_opus_20240229,0.9681372549019608,biggen_mwr,biggen_240612,[]
|
680 |
+
claude_3_sonnet_20240229,0.9240196078431373,biggen_mwr,biggen_240612,[]
|
681 |
+
codellama_13b,0.07598039215686275,biggen_mwr,biggen_240612,[]
|
682 |
+
codellama_13b_instruct,0.4276960784313726,biggen_mwr,biggen_240612,[]
|
683 |
+
codellama_34b,0.1482843137254902,biggen_mwr,biggen_240612,[]
|
684 |
+
codellama_34b_instruct,0.5098039215686274,biggen_mwr,biggen_240612,[]
|
685 |
+
codellama_70b,0.18872549019607843,biggen_mwr,biggen_240612,[]
|
686 |
+
codellama_70b_instruct,0.27450980392156865,biggen_mwr,biggen_240612,[]
|
687 |
+
codellama_7b,0.05514705882352941,biggen_mwr,biggen_240612,[]
|
688 |
+
codellama_7b_instruct,0.36519607843137253,biggen_mwr,biggen_240612,[]
|
689 |
+
codetulu_2_13b,0.43137254901960786,biggen_mwr,biggen_240612,[]
|
690 |
+
codetulu_2_34b,0.5441176470588235,biggen_mwr,biggen_240612,[]
|
691 |
+
codetulu_2_7b,0.32598039215686275,biggen_mwr,biggen_240612,[]
|
692 |
+
gemini_1.0_pro,0.7107843137254902,biggen_mwr,biggen_240612,[]
|
693 |
+
gemini_flash_1.5,0.866421568627451,biggen_mwr,biggen_240612,[]
|
694 |
+
gemini_pro_1.5,0.8676470588235294,biggen_mwr,biggen_240612,[]
|
695 |
+
gemma_1.1_2b_it,0.33578431372549017,biggen_mwr,biggen_240612,[]
|
696 |
+
gemma_1.1_7b_it,0.5551470588235294,biggen_mwr,biggen_240612,[]
|
697 |
+
gemma_2b,0.09803921568627451,biggen_mwr,biggen_240612,[]
|
698 |
+
gemma_2b_it,0.3333333333333333,biggen_mwr,biggen_240612,[]
|
699 |
+
gemma_7b,0.013480392156862746,biggen_mwr,biggen_240612,[]
|
700 |
+
gemma_7b_it,0.40931372549019607,biggen_mwr,biggen_240612,[]
|
701 |
+
gpt_3.5_turbo_0125,0.7757352941176471,biggen_mwr,biggen_240612,[]
|
702 |
+
gpt_3.5_turbo_1106,0.758578431372549,biggen_mwr,biggen_240612,[]
|
703 |
+
gpt_4_0125_preview,0.9779411764705882,biggen_mwr,biggen_240612,[]
|
704 |
+
gpt_4_1106_preview,0.9889705882352942,biggen_mwr,biggen_240612,[]
|
705 |
+
gpt_4_turbo_2024_04_09,0.9558823529411765,biggen_mwr,biggen_240612,[]
|
706 |
+
gpt_4o_2024_05_13,0.9436274509803921,biggen_mwr,biggen_240612,[]
|
707 |
+
llama_2_13b,0.20220588235294118,biggen_mwr,biggen_240612,[]
|
708 |
+
llama_2_13b_chat,0.5968137254901961,biggen_mwr,biggen_240612,[]
|
709 |
+
llama_2_70b,0.4656862745098039,biggen_mwr,biggen_240612,[]
|
710 |
+
llama_2_70b_chat,0.7205882352941176,biggen_mwr,biggen_240612,[]
|
711 |
+
llama_2_7b,0.1446078431372549,biggen_mwr,biggen_240612,[]
|
712 |
+
llama_2_7b_chat,0.5355392156862745,biggen_mwr,biggen_240612,[]
|
713 |
+
llemma_34b,0.21200980392156862,biggen_mwr,biggen_240612,[]
|
714 |
+
llemma_7b,0.11029411764705882,biggen_mwr,biggen_240612,[]
|
715 |
+
meta_llama_3_70b,0.36887254901960786,biggen_mwr,biggen_240612,[]
|
716 |
+
meta_llama_3_70b_instruct,0.875,biggen_mwr,biggen_240612,[]
|
717 |
+
meta_llama_3_8b,0.2377450980392157,biggen_mwr,biggen_240612,[]
|
718 |
+
meta_llama_3_8b_instruct,0.7328431372549019,biggen_mwr,biggen_240612,[]
|
719 |
+
mistral_7b_instruct_v0.2,0.7156862745098039,biggen_mwr,biggen_240612,[]
|
720 |
+
mistral_7b_v0.1,0.3272058823529412,biggen_mwr,biggen_240612,[]
|
721 |
+
mistral_7b_v0.2,0.3137254901960784,biggen_mwr,biggen_240612,[]
|
722 |
+
mistral_large_hjpark,0.8762254901960784,biggen_mwr,biggen_240612,[]
|
723 |
+
mistral_medium_hjpark,0.8970588235294118,biggen_mwr,biggen_240612,[]
|
724 |
+
mistral_orpo_alpha,0.5392156862745098,biggen_mwr,biggen_240612,[]
|
725 |
+
mistral_orpo_beta,0.5477941176470589,biggen_mwr,biggen_240612,[]
|
726 |
+
mixtral_8x22b_instruct_v0.1_awq,0.8198529411764706,biggen_mwr,biggen_240612,[]
|
727 |
+
mixtral_8x22b_v0.1_awq,0.5968137254901961,biggen_mwr,biggen_240612,[]
|
728 |
+
mixtral_8x7b_instruct_v0.1,0.7647058823529411,biggen_mwr,biggen_240612,[]
|
729 |
+
mixtral_8x7b_v0.1,0.5453431372549019,biggen_mwr,biggen_240612,[]
|
730 |
+
nous_hermes_2_mistral_7b_dpo,0.571078431372549,biggen_mwr,biggen_240612,[]
|
731 |
+
nous_hermes_2_mixtral_8x7b_dpo,0.7095588235294118,biggen_mwr,biggen_240612,[]
|
732 |
+
nous_hermes_2_mixtral_8x7b_sft,0.6262254901960784,biggen_mwr,biggen_240612,[]
|
733 |
+
nous_hermes_2_yi_34b,0.5906862745098039,biggen_mwr,biggen_240612,[]
|
734 |
+
olmo_1b,0.028186274509803922,biggen_mwr,biggen_240612,[]
|
735 |
+
olmo_7b,0.07107843137254902,biggen_mwr,biggen_240612,[]
|
736 |
+
olmo_7b_instruct,0.30269607843137253,biggen_mwr,biggen_240612,[]
|
737 |
+
olmo_7b_sft,0.2549019607843137,biggen_mwr,biggen_240612,[]
|
738 |
+
openchat_3.5_0106,0.6825980392156863,biggen_mwr,biggen_240612,[]
|
739 |
+
openhermes_2.5_mistral_7b,0.4583333333333333,biggen_mwr,biggen_240612,[]
|
740 |
+
openhermes_2_mistral_7b,0.5122549019607843,biggen_mwr,biggen_240612,[]
|
741 |
+
orca_2_13b,0.17401960784313725,biggen_mwr,biggen_240612,[]
|
742 |
+
orca_2_7b,0.08700980392156862,biggen_mwr,biggen_240612,[]
|
743 |
+
phi_1,0.0,biggen_mwr,biggen_240612,[]
|
744 |
+
phi_1_5,0.15318627450980393,biggen_mwr,biggen_240612,[]
|
745 |
+
phi_2,0.29044117647058826,biggen_mwr,biggen_240612,[]
|
746 |
+
phi_3_mini_128k_instruct,0.6911764705882353,biggen_mwr,biggen_240612,[]
|
747 |
+
phi_3_mini_4k_instruct,0.7867647058823529,biggen_mwr,biggen_240612,[]
|
748 |
+
qwen1.5_0.5b,0.0428921568627451,biggen_mwr,biggen_240612,[]
|
749 |
+
qwen1.5_0.5b_chat,0.07965686274509803,biggen_mwr,biggen_240612,[]
|
750 |
+
qwen1.5_1.8b,0.12867647058823528,biggen_mwr,biggen_240612,[]
|
751 |
+
qwen1.5_1.8b_chat,0.21691176470588236,biggen_mwr,biggen_240612,[]
|
752 |
+
qwen1.5_14b,0.3946078431372549,biggen_mwr,biggen_240612,[]
|
753 |
+
qwen1.5_14b_chat,0.7267156862745098,biggen_mwr,biggen_240612,[]
|
754 |
+
qwen1.5_32b,0.4791666666666667,biggen_mwr,biggen_240612,[]
|
755 |
+
qwen1.5_32b_chat,0.8149509803921569,biggen_mwr,biggen_240612,[]
|
756 |
+
qwen1.5_4b,0.21323529411764705,biggen_mwr,biggen_240612,[]
|
757 |
+
qwen1.5_4b_chat,0.29411764705882354,biggen_mwr,biggen_240612,[]
|
758 |
+
qwen1.5_72b,0.5294117647058824,biggen_mwr,biggen_240612,[]
|
759 |
+
qwen1.5_72b_chat,0.8713235294117647,biggen_mwr,biggen_240612,[]
|
760 |
+
qwen1.5_7b,0.2610294117647059,biggen_mwr,biggen_240612,[]
|
761 |
+
qwen1.5_7b_chat,0.6580882352941176,biggen_mwr,biggen_240612,[]
|
762 |
+
qwen_110b_chat,0.8848039215686274,biggen_mwr,biggen_240612,[]
|
763 |
+
solar_10.7b_instruct_v1.0,0.6862745098039216,biggen_mwr,biggen_240612,[]
|
764 |
+
solar_10.7b_v1.0,0.43995098039215685,biggen_mwr,biggen_240612,[]
|
765 |
+
starling_lm_7b_alpha,0.6139705882352942,biggen_mwr,biggen_240612,[]
|
766 |
+
starling_lm_7b_beta,0.7573529411764706,biggen_mwr,biggen_240612,[]
|
767 |
+
tulu_2_13b,0.4313725490196078,biggen_mwr,biggen_240612,[]
|
768 |
+
tulu_2_7b,0.3553921568627451,biggen_mwr,biggen_240612,[]
|
769 |
+
tulu_2_dpo_13b,0.5833333333333333,biggen_mwr,biggen_240612,[]
|
770 |
+
tulu_2_dpo_70b,0.7708333333333334,biggen_mwr,biggen_240612,[]
|
771 |
+
tulu_2_dpo_7b,0.4767156862745098,biggen_mwr,biggen_240612,[]
|
772 |
+
yi_34b,0.46078431372549017,biggen_mwr,biggen_240612,[]
|
773 |
+
yi_34b_chat,0.7720588235294118,biggen_mwr,biggen_240612,[]
|
774 |
+
yi_6b,0.17892156862745098,biggen_mwr,biggen_240612,[]
|
775 |
+
yi_6b_chat,0.4117647058823529,biggen_mwr,biggen_240612,[]
|
776 |
+
zephyr_7b_beta,0.6200980392156863,biggen_mwr,biggen_240612,[]
|
777 |
+
zephyr_orpo_141b_a35b_v0.1_awq,0.6311274509803921,biggen_mwr,biggen_240612,[]
|
778 |
+
gpt_4o_0513,1293.0,arena_elo,wildbench_240612,[]
|
779 |
+
gpt_4_turbo_0409,1251.0,arena_elo,wildbench_240612,[]
|
780 |
+
gpt_4_turbo_0125,1239.0,arena_elo,wildbench_240612,[]
|
781 |
+
llama_3_70b_inst,1213.0,arena_elo,wildbench_240612,[]
|
782 |
+
claude_3_opus,1232.0,arena_elo,wildbench_240612,[]
|
783 |
+
claude_3_sonnet,1187.0,arena_elo,wildbench_240612,[]
|
784 |
+
qwen1.5_72b_chat,1143.0,arena_elo,wildbench_240612,[]
|
785 |
+
command_r_plus,1155.0,arena_elo,wildbench_240612,[]
|
786 |
+
claude_3_haiku,1169.0,arena_elo,wildbench_240612,[]
|
787 |
+
mistral_large,1158.0,arena_elo,wildbench_240612,[]
|
788 |
+
starlinglm_7b_beta,1111.0,arena_elo,wildbench_240612,[]
|
789 |
+
llama_3_8b_inst,1144.0,arena_elo,wildbench_240612,[]
|
790 |
+
command_r,1106.0,arena_elo,wildbench_240612,[]
|
791 |
+
mixtral_8x7b_inst,1114.0,arena_elo,wildbench_240612,[]
|
792 |
+
dbrx_instruct,1106.0,arena_elo,wildbench_240612,[]
|
793 |
+
mistral_7b_inst_v0.2,1071.0,arena_elo,wildbench_240612,[]
|
794 |
+
tulu_2_dpo_70b,1099.0,arena_elo,wildbench_240612,[]
|
795 |
+
llama_2_70b_chat,1070.0,arena_elo,wildbench_240612,[]
|
796 |
+
qwen1.5_7b_chat,1059.0,arena_elo,wildbench_240612,[]
|
797 |
+
gpt_3.5_turbo_0125,1105.0,arena_elo,wildbench_240612,[]
|
798 |
+
llama_2_7b_chat,1012.0,arena_elo,wildbench_240612,[]
|
799 |
+
gemma_7b_it,1047.0,arena_elo,wildbench_240612,[]
|
800 |
+
gemma_2b_it,980.0,arena_elo,wildbench_240612,[]
|
801 |
+
gpt_4_turbo_0409,82.6,arena_hard,wildbench_240612,[]
|
802 |
+
gpt_4_turbo_0125,78.0,arena_hard,wildbench_240612,[]
|
803 |
+
llama_3_70b_inst,41.1,arena_hard,wildbench_240612,[]
|
804 |
+
claude_3_opus,60.4,arena_hard,wildbench_240612,[]
|
805 |
+
llama3_inst_8b_simpo,33.8,arena_hard,wildbench_240612,[]
|
806 |
+
claude_3_sonnet,46.8,arena_hard,wildbench_240612,[]
|
807 |
+
qwen1.5_72b_chat,36.1,arena_hard,wildbench_240612,[]
|
808 |
+
command_r_plus,33.1,arena_hard,wildbench_240612,[]
|
809 |
+
claude_3_haiku,41.5,arena_hard,wildbench_240612,[]
|
810 |
+
mistral_large,37.7,arena_hard,wildbench_240612,[]
|
811 |
+
starlinglm_7b_beta,23.0,arena_hard,wildbench_240612,[]
|
812 |
+
llama_3_8b_inst,20.6,arena_hard,wildbench_240612,[]
|
813 |
+
command_r,17.0,arena_hard,wildbench_240612,[]
|
814 |
+
mixtral_8x7b_inst,23.4,arena_hard,wildbench_240612,[]
|
815 |
+
dbrx_instruct,23.9,arena_hard,wildbench_240612,[]
|
816 |
+
tulu_2_dpo_70b,15.0,arena_hard,wildbench_240612,[]
|
817 |
+
llama_2_70b_chat,11.6,arena_hard,wildbench_240612,[]
|
818 |
+
gpt_3.5_turbo_0125,23.3,arena_hard,wildbench_240612,[]
|
819 |
+
llama_2_7b_chat,4.6,arena_hard,wildbench_240612,[]
|
820 |
+
gemma_7b_it,7.5,arena_hard,wildbench_240612,[]
|
821 |
+
gemma_2b_it,3.0,arena_hard,wildbench_240612,[]
|
822 |
+
gpt_4o_0513,57.5,alpacaeval2_lc,wildbench_240612,[]
|
823 |
+
gpt_4_turbo_0409,55.0,alpacaeval2_lc,wildbench_240612,[]
|
824 |
+
llama_3_70b_inst,34.4,alpacaeval2_lc,wildbench_240612,[]
|
825 |
+
claude_3_opus,40.5,alpacaeval2_lc,wildbench_240612,[]
|
826 |
+
llama3_inst_8b_simpo,44.7,alpacaeval2_lc,wildbench_240612,[]
|
827 |
+
claude_3_sonnet,34.9,alpacaeval2_lc,wildbench_240612,[]
|
828 |
+
qwen1.5_72b_chat,36.6,alpacaeval2_lc,wildbench_240612,[]
|
829 |
+
mistral_large,32.7,alpacaeval2_lc,wildbench_240612,[]
|
830 |
+
llama_3_8b_inst,22.9,alpacaeval2_lc,wildbench_240612,[]
|
831 |
+
mixtral_8x7b_inst,23.7,alpacaeval2_lc,wildbench_240612,[]
|
832 |
+
dbrx_instruct,25.4,alpacaeval2_lc,wildbench_240612,[]
|
833 |
+
mistral_7b_inst_v0.2,17.1,alpacaeval2_lc,wildbench_240612,[]
|
834 |
+
tulu_2_dpo_70b,21.2,alpacaeval2_lc,wildbench_240612,[]
|
835 |
+
llama_2_70b_chat,14.7,alpacaeval2_lc,wildbench_240612,[]
|
836 |
+
qwen1.5_7b_chat,14.7,alpacaeval2_lc,wildbench_240612,[]
|
837 |
+
llama_2_7b_chat,5.4,alpacaeval2_lc,wildbench_240612,[]
|
838 |
+
gemma_7b_it,10.4,alpacaeval2_lc,wildbench_240612,[]
|
839 |
+
gemma_2b_it,5.4,alpacaeval2_lc,wildbench_240612,[]
|
840 |
+
gpt_4o_0513,51.3,alpacav2,wildbench_240612,[]
|
841 |
+
gpt_4_turbo_0409,46.1,alpacav2,wildbench_240612,[]
|
842 |
+
llama_3_70b_inst,33.2,alpacav2,wildbench_240612,[]
|
843 |
+
claude_3_opus,29.1,alpacav2,wildbench_240612,[]
|
844 |
+
llama3_inst_8b_simpo,40.5,alpacav2,wildbench_240612,[]
|
845 |
+
claude_3_sonnet,25.6,alpacav2,wildbench_240612,[]
|
846 |
+
qwen1.5_72b_chat,26.5,alpacav2,wildbench_240612,[]
|
847 |
+
mistral_large,21.4,alpacav2,wildbench_240612,[]
|
848 |
+
llama_3_8b_inst,22.6,alpacav2,wildbench_240612,[]
|
849 |
+
mixtral_8x7b_inst,18.3,alpacav2,wildbench_240612,[]
|
850 |
+
dbrx_instruct,18.4,alpacav2,wildbench_240612,[]
|
851 |
+
mistral_7b_inst_v0.2,14.7,alpacav2,wildbench_240612,[]
|
852 |
+
tulu_2_dpo_70b,16.0,alpacav2,wildbench_240612,[]
|
853 |
+
llama_2_70b_chat,13.9,alpacav2,wildbench_240612,[]
|
854 |
+
qwen1.5_7b_chat,11.8,alpacav2,wildbench_240612,[]
|
855 |
+
llama_2_7b_chat,5.0,alpacav2,wildbench_240612,[]
|
856 |
+
gemma_7b_it,6.9,alpacav2,wildbench_240612,[]
|
857 |
+
gemma_2b_it,3.4,alpacav2,wildbench_240612,[]
|
858 |
+
pythia_1b,31.4,arc_c,olmes_260624,[]
|
859 |
+
olmo_1b,38.6,arc_c,olmes_260624,[]
|
860 |
+
tinyllama_1.1b,38.1,arc_c,olmes_260624,[]
|
861 |
+
pythia_6.7b,44.6,arc_c,olmes_260624,[]
|
862 |
+
rpj_incite_7b,45.3,arc_c,olmes_260624,[]
|
863 |
+
stablelm2_1.6b,50.6,arc_c,olmes_260624,[]
|
864 |
+
olmo_7b,46.4,arc_c,olmes_260624,[]
|
865 |
+
mpt_7b,45.7,arc_c,olmes_260624,[]
|
866 |
+
falcon_7b,49.7,arc_c,olmes_260624,[]
|
867 |
+
llama2_7b,54.2,arc_c,olmes_260624,[]
|
868 |
+
llama2_13b,67.3,arc_c,olmes_260624,[]
|
869 |
+
olmo_1.7_7b,66.9,arc_c,olmes_260624,[]
|
870 |
+
llama3_8b,79.3,arc_c,olmes_260624,[]
|
871 |
+
mistral_7b_v0.1,78.6,arc_c,olmes_260624,[]
|
872 |
+
llama3_70b,93.7,arc_c,olmes_260624,[]
|
873 |
+
pythia_1b,31.1,mmlu,olmes_260624,[]
|
874 |
+
olmo_1b,33.4,mmlu,olmes_260624,[]
|
875 |
+
tinyllama_1.1b,33.6,mmlu,olmes_260624,[]
|
876 |
+
pythia_6.7b,37.7,mmlu,olmes_260624,[]
|
877 |
+
rpj_incite_7b,40.1,mmlu,olmes_260624,[]
|
878 |
+
stablelm2_1.6b,40.4,mmlu,olmes_260624,[]
|
879 |
+
olmo_7b,40.5,mmlu,olmes_260624,[]
|
880 |
+
mpt_7b,40.6,mmlu,olmes_260624,[]
|
881 |
+
falcon_7b,42.1,mmlu,olmes_260624,[]
|
882 |
+
llama2_7b,46.2,mmlu,olmes_260624,[]
|
883 |
+
llama2_13b,55.8,mmlu,olmes_260624,[]
|
884 |
+
olmo_1.7_7b,54.4,mmlu,olmes_260624,[]
|
885 |
+
llama3_8b,66.6,mmlu,olmes_260624,[]
|
886 |
+
mistral_7b_v0.1,64.0,mmlu,olmes_260624,[]
|
887 |
+
llama3_70b,79.8,mmlu,olmes_260624,[]
|
888 |
+
pythia_1b,49.0,olmes_average,olmes_260624,[]
|
889 |
+
olmo_1b,55.1,olmes_average,olmes_260624,[]
|
890 |
+
tinyllama_1.1b,55.4,olmes_average,olmes_260624,[]
|
891 |
+
pythia_6.7b,59.1,olmes_average,olmes_260624,[]
|
892 |
+
rpj_incite_7b,62.8,olmes_average,olmes_260624,[]
|
893 |
+
stablelm2_1.6b,65.1,olmes_average,olmes_260624,[]
|
894 |
+
olmo_7b,65.3,olmes_average,olmes_260624,[]
|
895 |
+
mpt_7b,65.6,olmes_average,olmes_260624,[]
|
896 |
+
falcon_7b,66.9,olmes_average,olmes_260624,[]
|
897 |
+
llama2_7b,69.0,olmes_average,olmes_260624,[]
|
898 |
+
llama2_13b,74.0,olmes_average,olmes_260624,[]
|
899 |
+
olmo_1.7_7b,75.5,olmes_average,olmes_260624,[]
|
900 |
+
llama3_8b,78.7,olmes_average,olmes_260624,[]
|
901 |
+
mistral_7b_v0.1,79.1,olmes_average,olmes_260624,[]
|
902 |
+
llama3_70b,88.4,olmes_average,olmes_260624,[]
|
903 |
+
llama_2_70b,0.3753,mmlu_pro,mmlu_pro_240610,[]
|
904 |
+
llama_3_8b,0.3536,mmlu_pro,mmlu_pro_240610,[]
|
905 |
+
deepseekmath_instruct,0.353,mmlu_pro,mmlu_pro_240610,[]
|
906 |
+
gemma_7b,0.3373,mmlu_pro,mmlu_pro_240610,[]
|
907 |
+
mistral_7b_v0.1,0.3088,mmlu_pro,mmlu_pro_240610,[]
|
908 |
+
mistral_7b_instruct_v0.2,0.3084,mmlu_pro,mmlu_pro_240610,[]
|
909 |
+
mistral_7b_v0.2,0.3043,mmlu_pro,mmlu_pro_240610,[]
|
910 |
+
qwen1.5_7b_chat,0.2906,mmlu_pro,mmlu_pro_240610,[]
|
911 |
+
yi_6b_chat,0.2884,mmlu_pro,mmlu_pro_240610,[]
|
912 |
+
yi_6b,0.2651,mmlu_pro,mmlu_pro_240610,[]
|
913 |
+
mistral_7b_instruct_v0.1,0.2575,mmlu_pro,mmlu_pro_240610,[]
|
914 |
+
llama_2_13b,0.2534,mmlu_pro,mmlu_pro_240610,[]
|
915 |
+
llemma_7b,0.2345,mmlu_pro,mmlu_pro_240610,[]
|
916 |
+
llama_2_7b,0.2032,mmlu_pro,mmlu_pro_240610,[]
|
917 |
+
gpt_4o,0.7255,mmlu_pro,mmlu_pro_240610,[]
|
918 |
+
claude_3_opus,0.6845,mmlu_pro,mmlu_pro_240610,[]
|
919 |
+
gpt_4_turbo,0.6371,mmlu_pro,mmlu_pro_240610,[]
|
920 |
+
gemini_1.5_flash,0.5912,mmlu_pro,mmlu_pro_240610,[]
|
921 |
+
yi_large,0.5753,mmlu_pro,mmlu_pro_240610,[]
|
922 |
+
claude_3_sonnet,0.568,mmlu_pro,mmlu_pro_240610,[]
|
923 |
+
llama_3_70b_instruct,0.562,mmlu_pro,mmlu_pro_240610,[]
|
924 |
+
deepseek_v2,0.5481,mmlu_pro,mmlu_pro_240610,[]
|
925 |
+
phi_3_medium_4k_instruct,0.5348,mmlu_pro,mmlu_pro_240610,[]
|
926 |
+
llama_3_70b,0.5278,mmlu_pro,mmlu_pro_240610,[]
|
927 |
+
qwen1.5_72b_chat,0.5162,mmlu_pro,mmlu_pro_240610,[]
|
928 |
+
mammoth2_8x7b_plus,0.504,mmlu_pro,mmlu_pro_240610,[]
|
929 |
+
qwen1.5_110b,0.4993,mmlu_pro,mmlu_pro_240610,[]
|
930 |
+
mammoth2_8b_plus,0.4335,mmlu_pro,mmlu_pro_240610,[]
|
931 |
+
mixtral_8x7b_instruct_v0.1,0.4327,mmlu_pro,mmlu_pro_240610,[]
|
932 |
+
phi_3_mini_4k_instruct,0.4317,mmlu_pro,mmlu_pro_240610,[]
|
933 |
+
yi_34b,0.4303,mmlu_pro,mmlu_pro_240610,[]
|
934 |
+
mixtral_8x7b_v0.1,0.4103,mmlu_pro,mmlu_pro_240610,[]
|
935 |
+
llama_3_8b_instruct,0.4098,mmlu_pro,mmlu_pro_240610,[]
|
936 |
+
mammoth2_7b_plus,0.4085,mmlu_pro,mmlu_pro_240610,[]
|
937 |
+
qwen1.5_14b_chat,0.3802,mmlu_pro,mmlu_pro_240610,[]
|
938 |
+
c4ai_command_r_v01,0.379,mmlu_pro,mmlu_pro_240610,[]
|
assets/livebench.csv
ADDED
@@ -0,0 +1,365 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
,model,scenario,score,aggragated_from,source
|
2 |
+
0,claude_3_5_sonnet_20240620,livebench_lb,61.16,[],livebench_240701
|
3 |
+
1,gpt_4o_2024_05_13,livebench_lb,54.96,[],livebench_240701
|
4 |
+
2,gpt_4_turbo_2024_04_09,livebench_lb,53.0,[],livebench_240701
|
5 |
+
3,gpt_4_1106_preview,livebench_lb,52.17,[],livebench_240701
|
6 |
+
4,claude_3_opus_20240229,livebench_lb,50.75,[],livebench_240701
|
7 |
+
5,gpt_4_0125_preview,livebench_lb,49.39,[],livebench_240701
|
8 |
+
6,deepseek_coder_v2,livebench_lb,46.79,[],livebench_240701
|
9 |
+
7,gemini_1.5_pro_api_0514,livebench_lb,44.35,[],livebench_240701
|
10 |
+
8,gemma_2_27b_it,livebench_lb,41.22,[],livebench_240701
|
11 |
+
9,gemini_1.5_flash_api_0514,livebench_lb,40.89,[],livebench_240701
|
12 |
+
10,qwen2_72b_instruct,livebench_lb,40.16,[],livebench_240701
|
13 |
+
11,acm_rewrite_qwen2_72b_chat,livebench_lb,39.6,[],livebench_240701
|
14 |
+
12,mistral_large_2402,livebench_lb,38.92,[],livebench_240701
|
15 |
+
13,deepseek_chat_v2,livebench_lb,38.39,[],livebench_240701
|
16 |
+
14,claude_3_sonnet_20240229,livebench_lb,38.08,[],livebench_240701
|
17 |
+
15,meta_llama_3_70b_instruct,livebench_lb,37.38,[],livebench_240701
|
18 |
+
16,claude_3_haiku_20240307,livebench_lb,35.32,[],livebench_240701
|
19 |
+
17,mixtral_8x22b_instruct_v0.1,livebench_lb,34.84,[],livebench_240701
|
20 |
+
18,gpt_3.5_turbo_0125,livebench_lb,34.43,[],livebench_240701
|
21 |
+
19,gpt_3.5_turbo_1106,livebench_lb,34.14,[],livebench_240701
|
22 |
+
20,command_r_plus,livebench_lb,32.86,[],livebench_240701
|
23 |
+
21,mistral_small_2402,livebench_lb,32.8,[],livebench_240701
|
24 |
+
22,gemma_2_9b_it,livebench_lb,31.57,[],livebench_240701
|
25 |
+
23,phi_3_medium_4k_instruct,livebench_lb,30.33,[],livebench_240701
|
26 |
+
24,phi_3_medium_128k_instruct,livebench_lb,29.64,[],livebench_240701
|
27 |
+
25,deepseek_coder_v2_lite_instruct,livebench_lb,29.15,[],livebench_240701
|
28 |
+
26,qwen1.5_110b_chat,livebench_lb,28.96,[],livebench_240701
|
29 |
+
27,qwen1.5_72b_chat,livebench_lb,28.89,[],livebench_240701
|
30 |
+
28,command_r,livebench_lb,27.23,[],livebench_240701
|
31 |
+
29,phi_3_small_128k_instruct,livebench_lb,27.19,[],livebench_240701
|
32 |
+
30,meta_llama_3_8b_instruct,livebench_lb,26.67,[],livebench_240701
|
33 |
+
31,qwen2_7b_instruct,livebench_lb,26.45,[],livebench_240701
|
34 |
+
32,phi_3_small_8k_instruct,livebench_lb,26.24,[],livebench_240701
|
35 |
+
33,openhermes_2.5_mistral_7b,livebench_lb,23.3,[],livebench_240701
|
36 |
+
34,mixtral_8x7b_instruct_v0.1,livebench_lb,22.5,[],livebench_240701
|
37 |
+
35,mistral_7b_instruct_v0.2,livebench_lb,19.33,[],livebench_240701
|
38 |
+
36,phi_3_mini_4k_instruct,livebench_lb,19.27,[],livebench_240701
|
39 |
+
37,zephyr_7b_alpha,livebench_lb,19.22,[],livebench_240701
|
40 |
+
38,phi_3_mini_128k_instruct,livebench_lb,18.04,[],livebench_240701
|
41 |
+
39,zephyr_7b_beta,livebench_lb,17.32,[],livebench_240701
|
42 |
+
40,deepseek_v2_lite_chat,livebench_lb,17.14,[],livebench_240701
|
43 |
+
41,qwen1.5_7b_chat,livebench_lb,16.5,[],livebench_240701
|
44 |
+
42,starling_lm_7b_beta,livebench_lb,16.44,[],livebench_240701
|
45 |
+
43,vicuna_7b_v1.5_16k,livebench_lb,13.71,[],livebench_240701
|
46 |
+
44,vicuna_7b_v1.5,livebench_lb,11.73,[],livebench_240701
|
47 |
+
45,qwen1.5_4b_chat,livebench_lb,11.13,[],livebench_240701
|
48 |
+
46,llama_2_7b_chat,livebench_lb,10.25,[],livebench_240701
|
49 |
+
47,qwen2_1.5b_instruct,livebench_lb,9.96,[],livebench_240701
|
50 |
+
48,yi_6b_chat,livebench_lb,8.79,[],livebench_240701
|
51 |
+
49,qwen2_0.5b_instruct,livebench_lb,6.78,[],livebench_240701
|
52 |
+
50,qwen1.5_1.8b_chat,livebench_lb,6.09,[],livebench_240701
|
53 |
+
51,qwen1.5_0.5b_chat,livebench_lb,5.26,[],livebench_240701
|
54 |
+
52,claude_3_5_sonnet_20240620,reasoning_lb,64.0,[],livebench_240701
|
55 |
+
53,gpt_4o_2024_05_13,reasoning_lb,55.0,[],livebench_240701
|
56 |
+
54,gpt_4_turbo_2024_04_09,reasoning_lb,54.0,[],livebench_240701
|
57 |
+
55,gpt_4_1106_preview,reasoning_lb,52.0,[],livebench_240701
|
58 |
+
56,claude_3_opus_20240229,reasoning_lb,41.0,[],livebench_240701
|
59 |
+
57,gpt_4_0125_preview,reasoning_lb,48.0,[],livebench_240701
|
60 |
+
58,deepseek_coder_v2,reasoning_lb,49.0,[],livebench_240701
|
61 |
+
59,gemini_1.5_pro_api_0514,reasoning_lb,33.0,[],livebench_240701
|
62 |
+
60,gemma_2_27b_it,reasoning_lb,31.0,[],livebench_240701
|
63 |
+
61,gemini_1.5_flash_api_0514,reasoning_lb,30.0,[],livebench_240701
|
64 |
+
62,qwen2_72b_instruct,reasoning_lb,42.0,[],livebench_240701
|
65 |
+
63,acm_rewrite_qwen2_72b_chat,reasoning_lb,37.0,[],livebench_240701
|
66 |
+
64,mistral_large_2402,reasoning_lb,35.0,[],livebench_240701
|
67 |
+
65,deepseek_chat_v2,reasoning_lb,29.0,[],livebench_240701
|
68 |
+
66,claude_3_sonnet_20240229,reasoning_lb,26.0,[],livebench_240701
|
69 |
+
67,meta_llama_3_70b_instruct,reasoning_lb,31.0,[],livebench_240701
|
70 |
+
68,claude_3_haiku_20240307,reasoning_lb,26.0,[],livebench_240701
|
71 |
+
69,mixtral_8x22b_instruct_v0.1,reasoning_lb,29.0,[],livebench_240701
|
72 |
+
70,gpt_3.5_turbo_0125,reasoning_lb,26.0,[],livebench_240701
|
73 |
+
71,gpt_3.5_turbo_1106,reasoning_lb,28.0,[],livebench_240701
|
74 |
+
72,command_r_plus,reasoning_lb,32.0,[],livebench_240701
|
75 |
+
73,mistral_small_2402,reasoning_lb,28.0,[],livebench_240701
|
76 |
+
74,gemma_2_9b_it,reasoning_lb,19.0,[],livebench_240701
|
77 |
+
75,phi_3_medium_4k_instruct,reasoning_lb,35.0,[],livebench_240701
|
78 |
+
76,phi_3_medium_128k_instruct,reasoning_lb,31.0,[],livebench_240701
|
79 |
+
77,deepseek_coder_v2_lite_instruct,reasoning_lb,22.0,[],livebench_240701
|
80 |
+
78,qwen1.5_110b_chat,reasoning_lb,26.0,[],livebench_240701
|
81 |
+
79,qwen1.5_72b_chat,reasoning_lb,21.0,[],livebench_240701
|
82 |
+
80,command_r,reasoning_lb,28.0,[],livebench_240701
|
83 |
+
81,phi_3_small_128k_instruct,reasoning_lb,36.0,[],livebench_240701
|
84 |
+
82,meta_llama_3_8b_instruct,reasoning_lb,25.0,[],livebench_240701
|
85 |
+
83,qwen2_7b_instruct,reasoning_lb,20.0,[],livebench_240701
|
86 |
+
84,phi_3_small_8k_instruct,reasoning_lb,23.0,[],livebench_240701
|
87 |
+
85,openhermes_2.5_mistral_7b,reasoning_lb,17.0,[],livebench_240701
|
88 |
+
86,mixtral_8x7b_instruct_v0.1,reasoning_lb,18.0,[],livebench_240701
|
89 |
+
87,mistral_7b_instruct_v0.2,reasoning_lb,13.0,[],livebench_240701
|
90 |
+
88,phi_3_mini_4k_instruct,reasoning_lb,19.0,[],livebench_240701
|
91 |
+
89,zephyr_7b_alpha,reasoning_lb,17.0,[],livebench_240701
|
92 |
+
90,phi_3_mini_128k_instruct,reasoning_lb,10.0,[],livebench_240701
|
93 |
+
91,zephyr_7b_beta,reasoning_lb,16.0,[],livebench_240701
|
94 |
+
92,deepseek_v2_lite_chat,reasoning_lb,13.0,[],livebench_240701
|
95 |
+
93,qwen1.5_7b_chat,reasoning_lb,13.0,[],livebench_240701
|
96 |
+
94,starling_lm_7b_beta,reasoning_lb,19.0,[],livebench_240701
|
97 |
+
95,vicuna_7b_v1.5_16k,reasoning_lb,15.0,[],livebench_240701
|
98 |
+
96,vicuna_7b_v1.5,reasoning_lb,12.0,[],livebench_240701
|
99 |
+
97,qwen1.5_4b_chat,reasoning_lb,13.0,[],livebench_240701
|
100 |
+
98,llama_2_7b_chat,reasoning_lb,5.0,[],livebench_240701
|
101 |
+
99,qwen2_1.5b_instruct,reasoning_lb,8.0,[],livebench_240701
|
102 |
+
100,yi_6b_chat,reasoning_lb,8.0,[],livebench_240701
|
103 |
+
101,qwen2_0.5b_instruct,reasoning_lb,3.0,[],livebench_240701
|
104 |
+
102,qwen1.5_1.8b_chat,reasoning_lb,5.0,[],livebench_240701
|
105 |
+
103,qwen1.5_0.5b_chat,reasoning_lb,4.0,[],livebench_240701
|
106 |
+
104,claude_3_5_sonnet_20240620,coding_lb,63.21,[],livebench_240701
|
107 |
+
105,gpt_4o_2024_05_13,coding_lb,46.37,[],livebench_240701
|
108 |
+
106,gpt_4_turbo_2024_04_09,coding_lb,47.05,[],livebench_240701
|
109 |
+
107,gpt_4_1106_preview,coding_lb,44.37,[],livebench_240701
|
110 |
+
108,claude_3_opus_20240229,coding_lb,40.05,[],livebench_240701
|
111 |
+
109,gpt_4_0125_preview,coding_lb,44.05,[],livebench_240701
|
112 |
+
110,deepseek_coder_v2,coding_lb,41.05,[],livebench_240701
|
113 |
+
111,gemini_1.5_pro_api_0514,coding_lb,32.79,[],livebench_240701
|
114 |
+
112,gemma_2_27b_it,coding_lb,36.74,[],livebench_240701
|
115 |
+
113,gemini_1.5_flash_api_0514,coding_lb,39.05,[],livebench_240701
|
116 |
+
114,qwen2_72b_instruct,coding_lb,31.79,[],livebench_240701
|
117 |
+
115,acm_rewrite_qwen2_72b_chat,coding_lb,39.05,[],livebench_240701
|
118 |
+
116,mistral_large_2402,coding_lb,26.84,[],livebench_240701
|
119 |
+
117,deepseek_chat_v2,coding_lb,33.47,[],livebench_240701
|
120 |
+
118,claude_3_sonnet_20240229,coding_lb,25.21,[],livebench_240701
|
121 |
+
119,meta_llama_3_70b_instruct,coding_lb,20.95,[],livebench_240701
|
122 |
+
120,claude_3_haiku_20240307,coding_lb,24.53,[],livebench_240701
|
123 |
+
121,mixtral_8x22b_instruct_v0.1,coding_lb,33.11,[],livebench_240701
|
124 |
+
122,gpt_3.5_turbo_0125,coding_lb,29.16,[],livebench_240701
|
125 |
+
123,gpt_3.5_turbo_1106,coding_lb,26.84,[],livebench_240701
|
126 |
+
124,command_r_plus,coding_lb,20.26,[],livebench_240701
|
127 |
+
125,mistral_small_2402,coding_lb,24.21,[],livebench_240701
|
128 |
+
126,gemma_2_9b_it,coding_lb,22.21,[],livebench_240701
|
129 |
+
127,phi_3_medium_4k_instruct,coding_lb,20.58,[],livebench_240701
|
130 |
+
128,phi_3_medium_128k_instruct,coding_lb,21.58,[],livebench_240701
|
131 |
+
129,deepseek_coder_v2_lite_instruct,coding_lb,26.84,[],livebench_240701
|
132 |
+
130,qwen1.5_110b_chat,coding_lb,22.21,[],livebench_240701
|
133 |
+
131,qwen1.5_72b_chat,coding_lb,22.89,[],livebench_240701
|
134 |
+
132,command_r,coding_lb,14.95,[],livebench_240701
|
135 |
+
133,phi_3_small_128k_instruct,coding_lb,25.84,[],livebench_240701
|
136 |
+
134,meta_llama_3_8b_instruct,coding_lb,18.26,[],livebench_240701
|
137 |
+
135,qwen2_7b_instruct,coding_lb,29.21,[],livebench_240701
|
138 |
+
136,phi_3_small_8k_instruct,coding_lb,19.58,[],livebench_240701
|
139 |
+
137,openhermes_2.5_mistral_7b,coding_lb,11.63,[],livebench_240701
|
140 |
+
138,mixtral_8x7b_instruct_v0.1,coding_lb,11.32,[],livebench_240701
|
141 |
+
139,mistral_7b_instruct_v0.2,coding_lb,11.63,[],livebench_240701
|
142 |
+
140,phi_3_mini_4k_instruct,coding_lb,14.95,[],livebench_240701
|
143 |
+
141,zephyr_7b_alpha,coding_lb,11.32,[],livebench_240701
|
144 |
+
142,phi_3_mini_128k_instruct,coding_lb,11.63,[],livebench_240701
|
145 |
+
143,zephyr_7b_beta,coding_lb,8.32,[],livebench_240701
|
146 |
+
144,deepseek_v2_lite_chat,coding_lb,8.63,[],livebench_240701
|
147 |
+
145,qwen1.5_7b_chat,coding_lb,6.63,[],livebench_240701
|
148 |
+
146,starling_lm_7b_beta,coding_lb,18.26,[],livebench_240701
|
149 |
+
147,vicuna_7b_v1.5_16k,coding_lb,1.32,[],livebench_240701
|
150 |
+
148,vicuna_7b_v1.5,coding_lb,1.0,[],livebench_240701
|
151 |
+
149,qwen1.5_4b_chat,coding_lb,4.0,[],livebench_240701
|
152 |
+
150,llama_2_7b_chat,coding_lb,0.0,[],livebench_240701
|
153 |
+
151,qwen2_1.5b_instruct,coding_lb,5.63,[],livebench_240701
|
154 |
+
152,yi_6b_chat,coding_lb,1.32,[],livebench_240701
|
155 |
+
153,qwen2_0.5b_instruct,coding_lb,2.0,[],livebench_240701
|
156 |
+
154,qwen1.5_1.8b_chat,coding_lb,0.0,[],livebench_240701
|
157 |
+
155,qwen1.5_0.5b_chat,coding_lb,0.0,[],livebench_240701
|
158 |
+
156,claude_3_5_sonnet_20240620,mathematics_lb,53.75,[],livebench_240701
|
159 |
+
157,gpt_4o_2024_05_13,mathematics_lb,49.88,[],livebench_240701
|
160 |
+
158,gpt_4_turbo_2024_04_09,mathematics_lb,48.99,[],livebench_240701
|
161 |
+
159,gpt_4_1106_preview,mathematics_lb,47.55,[],livebench_240701
|
162 |
+
160,claude_3_opus_20240229,mathematics_lb,46.54,[],livebench_240701
|
163 |
+
161,gpt_4_0125_preview,mathematics_lb,42.75,[],livebench_240701
|
164 |
+
162,deepseek_coder_v2,mathematics_lb,52.19,[],livebench_240701
|
165 |
+
163,gemini_1.5_pro_api_0514,mathematics_lb,42.07,[],livebench_240701
|
166 |
+
164,gemma_2_27b_it,mathematics_lb,36.23,[],livebench_240701
|
167 |
+
165,gemini_1.5_flash_api_0514,mathematics_lb,38.54,[],livebench_240701
|
168 |
+
166,qwen2_72b_instruct,mathematics_lb,43.44,[],livebench_240701
|
169 |
+
167,acm_rewrite_qwen2_72b_chat,mathematics_lb,40.32,[],livebench_240701
|
170 |
+
168,mistral_large_2402,mathematics_lb,32.2,[],livebench_240701
|
171 |
+
169,deepseek_chat_v2,mathematics_lb,33.23,[],livebench_240701
|
172 |
+
170,claude_3_sonnet_20240229,mathematics_lb,29.65,[],livebench_240701
|
173 |
+
171,meta_llama_3_70b_instruct,mathematics_lb,32.31,[],livebench_240701
|
174 |
+
172,claude_3_haiku_20240307,mathematics_lb,25.72,[],livebench_240701
|
175 |
+
173,mixtral_8x22b_instruct_v0.1,mathematics_lb,26.94,[],livebench_240701
|
176 |
+
174,gpt_3.5_turbo_0125,mathematics_lb,25.54,[],livebench_240701
|
177 |
+
175,gpt_3.5_turbo_1106,mathematics_lb,28.13,[],livebench_240701
|
178 |
+
176,command_r_plus,mathematics_lb,24.85,[],livebench_240701
|
179 |
+
177,mistral_small_2402,mathematics_lb,26.76,[],livebench_240701
|
180 |
+
178,gemma_2_9b_it,mathematics_lb,23.98,[],livebench_240701
|
181 |
+
179,phi_3_medium_4k_instruct,mathematics_lb,27.54,[],livebench_240701
|
182 |
+
180,phi_3_medium_128k_instruct,mathematics_lb,24.25,[],livebench_240701
|
183 |
+
181,deepseek_coder_v2_lite_instruct,mathematics_lb,34.09,[],livebench_240701
|
184 |
+
182,qwen1.5_110b_chat,mathematics_lb,25.58,[],livebench_240701
|
185 |
+
183,qwen1.5_72b_chat,mathematics_lb,26.82,[],livebench_240701
|
186 |
+
184,command_r,mathematics_lb,16.92,[],livebench_240701
|
187 |
+
185,phi_3_small_128k_instruct,mathematics_lb,24.84,[],livebench_240701
|
188 |
+
186,meta_llama_3_8b_instruct,mathematics_lb,17.58,[],livebench_240701
|
189 |
+
187,qwen2_7b_instruct,mathematics_lb,25.83,[],livebench_240701
|
190 |
+
188,phi_3_small_8k_instruct,mathematics_lb,24.15,[],livebench_240701
|
191 |
+
189,openhermes_2.5_mistral_7b,mathematics_lb,20.1,[],livebench_240701
|
192 |
+
190,mixtral_8x7b_instruct_v0.1,mathematics_lb,18.97,[],livebench_240701
|
193 |
+
191,mistral_7b_instruct_v0.2,mathematics_lb,16.04,[],livebench_240701
|
194 |
+
192,phi_3_mini_4k_instruct,mathematics_lb,19.88,[],livebench_240701
|
195 |
+
193,zephyr_7b_alpha,mathematics_lb,9.61,[],livebench_240701
|
196 |
+
194,phi_3_mini_128k_instruct,mathematics_lb,21.48,[],livebench_240701
|
197 |
+
195,zephyr_7b_beta,mathematics_lb,11.23,[],livebench_240701
|
198 |
+
196,deepseek_v2_lite_chat,mathematics_lb,11.99,[],livebench_240701
|
199 |
+
197,qwen1.5_7b_chat,mathematics_lb,12.86,[],livebench_240701
|
200 |
+
198,starling_lm_7b_beta,mathematics_lb,13.82,[],livebench_240701
|
201 |
+
199,vicuna_7b_v1.5_16k,mathematics_lb,6.61,[],livebench_240701
|
202 |
+
200,vicuna_7b_v1.5,mathematics_lb,4.33,[],livebench_240701
|
203 |
+
201,qwen1.5_4b_chat,mathematics_lb,7.08,[],livebench_240701
|
204 |
+
202,llama_2_7b_chat,mathematics_lb,4.78,[],livebench_240701
|
205 |
+
203,qwen2_1.5b_instruct,mathematics_lb,7.16,[],livebench_240701
|
206 |
+
204,yi_6b_chat,mathematics_lb,7.14,[],livebench_240701
|
207 |
+
205,qwen2_0.5b_instruct,mathematics_lb,4.22,[],livebench_240701
|
208 |
+
206,qwen1.5_1.8b_chat,mathematics_lb,2.14,[],livebench_240701
|
209 |
+
207,qwen1.5_0.5b_chat,mathematics_lb,3.39,[],livebench_240701
|
210 |
+
208,claude_3_5_sonnet_20240620,data_analysis_lb,56.74,[],livebench_240701
|
211 |
+
209,gpt_4o_2024_05_13,data_analysis_lb,52.41,[],livebench_240701
|
212 |
+
210,gpt_4_turbo_2024_04_09,data_analysis_lb,51.32,[],livebench_240701
|
213 |
+
211,gpt_4_1106_preview,data_analysis_lb,51.33,[],livebench_240701
|
214 |
+
212,claude_3_opus_20240229,data_analysis_lb,54.32,[],livebench_240701
|
215 |
+
213,gpt_4_0125_preview,data_analysis_lb,54.06,[],livebench_240701
|
216 |
+
214,deepseek_coder_v2,data_analysis_lb,38.25,[],livebench_240701
|
217 |
+
215,gemini_1.5_pro_api_0514,data_analysis_lb,52.81,[],livebench_240701
|
218 |
+
216,gemma_2_27b_it,data_analysis_lb,43.58,[],livebench_240701
|
219 |
+
217,gemini_1.5_flash_api_0514,data_analysis_lb,44.03,[],livebench_240701
|
220 |
+
218,qwen2_72b_instruct,data_analysis_lb,26.24,[],livebench_240701
|
221 |
+
219,acm_rewrite_qwen2_72b_chat,data_analysis_lb,26.19,[],livebench_240701
|
222 |
+
220,mistral_large_2402,data_analysis_lb,42.55,[],livebench_240701
|
223 |
+
221,deepseek_chat_v2,data_analysis_lb,38.03,[],livebench_240701
|
224 |
+
222,claude_3_sonnet_20240229,data_analysis_lb,44.56,[],livebench_240701
|
225 |
+
223,meta_llama_3_70b_instruct,data_analysis_lb,42.41,[],livebench_240701
|
226 |
+
224,claude_3_haiku_20240307,data_analysis_lb,41.54,[],livebench_240701
|
227 |
+
225,mixtral_8x22b_instruct_v0.1,data_analysis_lb,30.33,[],livebench_240701
|
228 |
+
226,gpt_3.5_turbo_0125,data_analysis_lb,41.21,[],livebench_240701
|
229 |
+
227,gpt_3.5_turbo_1106,data_analysis_lb,41.7,[],livebench_240701
|
230 |
+
228,command_r_plus,data_analysis_lb,24.6,[],livebench_240701
|
231 |
+
229,mistral_small_2402,data_analysis_lb,31.88,[],livebench_240701
|
232 |
+
230,gemma_2_9b_it,data_analysis_lb,35.06,[],livebench_240701
|
233 |
+
231,phi_3_medium_4k_instruct,data_analysis_lb,31.63,[],livebench_240701
|
234 |
+
232,phi_3_medium_128k_instruct,data_analysis_lb,32.12,[],livebench_240701
|
235 |
+
233,deepseek_coder_v2_lite_instruct,data_analysis_lb,33.0,[],livebench_240701
|
236 |
+
234,qwen1.5_110b_chat,data_analysis_lb,31.45,[],livebench_240701
|
237 |
+
235,qwen1.5_72b_chat,data_analysis_lb,32.98,[],livebench_240701
|
238 |
+
236,command_r,data_analysis_lb,31.69,[],livebench_240701
|
239 |
+
237,phi_3_small_128k_instruct,data_analysis_lb,27.33,[],livebench_240701
|
240 |
+
238,meta_llama_3_8b_instruct,data_analysis_lb,23.33,[],livebench_240701
|
241 |
+
239,qwen2_7b_instruct,data_analysis_lb,28.75,[],livebench_240701
|
242 |
+
240,phi_3_small_8k_instruct,data_analysis_lb,27.5,[],livebench_240701
|
243 |
+
241,openhermes_2.5_mistral_7b,data_analysis_lb,26.92,[],livebench_240701
|
244 |
+
242,mixtral_8x7b_instruct_v0.1,data_analysis_lb,28.13,[],livebench_240701
|
245 |
+
243,mistral_7b_instruct_v0.2,data_analysis_lb,14.62,[],livebench_240701
|
246 |
+
244,phi_3_mini_4k_instruct,data_analysis_lb,14.67,[],livebench_240701
|
247 |
+
245,zephyr_7b_alpha,data_analysis_lb,17.4,[],livebench_240701
|
248 |
+
246,phi_3_mini_128k_instruct,data_analysis_lb,8.69,[],livebench_240701
|
249 |
+
247,zephyr_7b_beta,data_analysis_lb,15.75,[],livebench_240701
|
250 |
+
248,deepseek_v2_lite_chat,data_analysis_lb,18.19,[],livebench_240701
|
251 |
+
249,qwen1.5_7b_chat,data_analysis_lb,16.23,[],livebench_240701
|
252 |
+
250,starling_lm_7b_beta,data_analysis_lb,2.0,[],livebench_240701
|
253 |
+
251,vicuna_7b_v1.5_16k,data_analysis_lb,9.27,[],livebench_240701
|
254 |
+
252,vicuna_7b_v1.5,data_analysis_lb,2.67,[],livebench_240701
|
255 |
+
253,qwen1.5_4b_chat,data_analysis_lb,9.13,[],livebench_240701
|
256 |
+
254,llama_2_7b_chat,data_analysis_lb,0.0,[],livebench_240701
|
257 |
+
255,qwen2_1.5b_instruct,data_analysis_lb,10.01,[],livebench_240701
|
258 |
+
256,yi_6b_chat,data_analysis_lb,4.38,[],livebench_240701
|
259 |
+
257,qwen2_0.5b_instruct,data_analysis_lb,2.0,[],livebench_240701
|
260 |
+
258,qwen1.5_1.8b_chat,data_analysis_lb,3.33,[],livebench_240701
|
261 |
+
259,qwen1.5_0.5b_chat,data_analysis_lb,0.0,[],livebench_240701
|
262 |
+
260,claude_3_5_sonnet_20240620,language_lb,56.94,[],livebench_240701
|
263 |
+
261,gpt_4o_2024_05_13,language_lb,53.94,[],livebench_240701
|
264 |
+
262,gpt_4_turbo_2024_04_09,language_lb,45.26,[],livebench_240701
|
265 |
+
263,gpt_4_1106_preview,language_lb,48.37,[],livebench_240701
|
266 |
+
264,claude_3_opus_20240229,language_lb,51.72,[],livebench_240701
|
267 |
+
265,gpt_4_0125_preview,language_lb,43.55,[],livebench_240701
|
268 |
+
266,deepseek_coder_v2,language_lb,33.04,[],livebench_240701
|
269 |
+
267,gemini_1.5_pro_api_0514,language_lb,38.25,[],livebench_240701
|
270 |
+
268,gemma_2_27b_it,language_lb,32.4,[],livebench_240701
|
271 |
+
269,gemini_1.5_flash_api_0514,language_lb,30.69,[],livebench_240701
|
272 |
+
270,qwen2_72b_instruct,language_lb,29.21,[],livebench_240701
|
273 |
+
271,acm_rewrite_qwen2_72b_chat,language_lb,30.03,[],livebench_240701
|
274 |
+
272,mistral_large_2402,language_lb,28.74,[],livebench_240701
|
275 |
+
273,deepseek_chat_v2,language_lb,32.29,[],livebench_240701
|
276 |
+
274,claude_3_sonnet_20240229,language_lb,38.08,[],livebench_240701
|
277 |
+
275,meta_llama_3_70b_instruct,language_lb,34.11,[],livebench_240701
|
278 |
+
276,claude_3_haiku_20240307,language_lb,30.07,[],livebench_240701
|
279 |
+
277,mixtral_8x22b_instruct_v0.1,language_lb,26.48,[],livebench_240701
|
280 |
+
278,gpt_3.5_turbo_0125,language_lb,24.22,[],livebench_240701
|
281 |
+
279,gpt_3.5_turbo_1106,language_lb,28.63,[],livebench_240701
|
282 |
+
280,command_r_plus,language_lb,23.92,[],livebench_240701
|
283 |
+
281,mistral_small_2402,language_lb,22.06,[],livebench_240701
|
284 |
+
282,gemma_2_9b_it,language_lb,27.64,[],livebench_240701
|
285 |
+
283,phi_3_medium_4k_instruct,language_lb,13.91,[],livebench_240701
|
286 |
+
284,phi_3_medium_128k_instruct,language_lb,12.76,[],livebench_240701
|
287 |
+
285,deepseek_coder_v2_lite_instruct,language_lb,10.64,[],livebench_240701
|
288 |
+
286,qwen1.5_110b_chat,language_lb,13.22,[],livebench_240701
|
289 |
+
287,qwen1.5_72b_chat,language_lb,11.37,[],livebench_240701
|
290 |
+
288,command_r,language_lb,14.64,[],livebench_240701
|
291 |
+
289,phi_3_small_128k_instruct,language_lb,12.28,[],livebench_240701
|
292 |
+
290,meta_llama_3_8b_instruct,language_lb,18.72,[],livebench_240701
|
293 |
+
291,qwen2_7b_instruct,language_lb,10.21,[],livebench_240701
|
294 |
+
292,phi_3_small_8k_instruct,language_lb,14.96,[],livebench_240701
|
295 |
+
293,openhermes_2.5_mistral_7b,language_lb,11.37,[],livebench_240701
|
296 |
+
294,mixtral_8x7b_instruct_v0.1,language_lb,13.76,[],livebench_240701
|
297 |
+
295,mistral_7b_instruct_v0.2,language_lb,9.05,[],livebench_240701
|
298 |
+
296,phi_3_mini_4k_instruct,language_lb,7.1,[],livebench_240701
|
299 |
+
297,zephyr_7b_alpha,language_lb,7.2,[],livebench_240701
|
300 |
+
298,phi_3_mini_128k_instruct,language_lb,6.8,[],livebench_240701
|
301 |
+
299,zephyr_7b_beta,language_lb,4.28,[],livebench_240701
|
302 |
+
300,deepseek_v2_lite_chat,language_lb,9.2,[],livebench_240701
|
303 |
+
301,qwen1.5_7b_chat,language_lb,6.18,[],livebench_240701
|
304 |
+
302,starling_lm_7b_beta,language_lb,7.26,[],livebench_240701
|
305 |
+
303,vicuna_7b_v1.5_16k,language_lb,7.92,[],livebench_240701
|
306 |
+
304,vicuna_7b_v1.5,language_lb,8.66,[],livebench_240701
|
307 |
+
305,qwen1.5_4b_chat,language_lb,5.8,[],livebench_240701
|
308 |
+
306,llama_2_7b_chat,language_lb,6.86,[],livebench_240701
|
309 |
+
307,qwen2_1.5b_instruct,language_lb,3.05,[],livebench_240701
|
310 |
+
308,yi_6b_chat,language_lb,4.69,[],livebench_240701
|
311 |
+
309,qwen2_0.5b_instruct,language_lb,2.8,[],livebench_240701
|
312 |
+
310,qwen1.5_1.8b_chat,language_lb,3.16,[],livebench_240701
|
313 |
+
311,qwen1.5_0.5b_chat,language_lb,2.88,[],livebench_240701
|
314 |
+
312,claude_3_5_sonnet_20240620,if_lb,72.3,[],livebench_240701
|
315 |
+
313,gpt_4o_2024_05_13,if_lb,72.17,[],livebench_240701
|
316 |
+
314,gpt_4_turbo_2024_04_09,if_lb,71.39,[],livebench_240701
|
317 |
+
315,gpt_4_1106_preview,if_lb,69.39,[],livebench_240701
|
318 |
+
316,claude_3_opus_20240229,if_lb,70.87,[],livebench_240701
|
319 |
+
317,gpt_4_0125_preview,if_lb,63.92,[],livebench_240701
|
320 |
+
318,deepseek_coder_v2,if_lb,67.18,[],livebench_240701
|
321 |
+
319,gemini_1.5_pro_api_0514,if_lb,67.2,[],livebench_240701
|
322 |
+
320,gemma_2_27b_it,if_lb,67.37,[],livebench_240701
|
323 |
+
321,gemini_1.5_flash_api_0514,if_lb,63.01,[],livebench_240701
|
324 |
+
322,qwen2_72b_instruct,if_lb,68.27,[],livebench_240701
|
325 |
+
323,acm_rewrite_qwen2_72b_chat,if_lb,65.0,[],livebench_240701
|
326 |
+
324,mistral_large_2402,if_lb,68.19,[],livebench_240701
|
327 |
+
325,deepseek_chat_v2,if_lb,64.34,[],livebench_240701
|
328 |
+
326,claude_3_sonnet_20240229,if_lb,65.0,[],livebench_240701
|
329 |
+
327,meta_llama_3_70b_instruct,if_lb,63.5,[],livebench_240701
|
330 |
+
328,claude_3_haiku_20240307,if_lb,64.03,[],livebench_240701
|
331 |
+
329,mixtral_8x22b_instruct_v0.1,if_lb,63.17,[],livebench_240701
|
332 |
+
330,gpt_3.5_turbo_0125,if_lb,60.47,[],livebench_240701
|
333 |
+
331,gpt_3.5_turbo_1106,if_lb,51.53,[],livebench_240701
|
334 |
+
332,command_r_plus,if_lb,71.51,[],livebench_240701
|
335 |
+
333,mistral_small_2402,if_lb,63.91,[],livebench_240701
|
336 |
+
334,gemma_2_9b_it,if_lb,61.55,[],livebench_240701
|
337 |
+
335,phi_3_medium_4k_instruct,if_lb,53.3,[],livebench_240701
|
338 |
+
336,phi_3_medium_128k_instruct,if_lb,56.15,[],livebench_240701
|
339 |
+
337,deepseek_coder_v2_lite_instruct,if_lb,48.34,[],livebench_240701
|
340 |
+
338,qwen1.5_110b_chat,if_lb,55.26,[],livebench_240701
|
341 |
+
339,qwen1.5_72b_chat,if_lb,58.25,[],livebench_240701
|
342 |
+
340,command_r,if_lb,57.16,[],livebench_240701
|
343 |
+
341,phi_3_small_128k_instruct,if_lb,36.88,[],livebench_240701
|
344 |
+
342,meta_llama_3_8b_instruct,if_lb,57.14,[],livebench_240701
|
345 |
+
343,qwen2_7b_instruct,if_lb,44.74,[],livebench_240701
|
346 |
+
344,phi_3_small_8k_instruct,if_lb,48.24,[],livebench_240701
|
347 |
+
345,openhermes_2.5_mistral_7b,if_lb,52.78,[],livebench_240701
|
348 |
+
346,mixtral_8x7b_instruct_v0.1,if_lb,44.81,[],livebench_240701
|
349 |
+
347,mistral_7b_instruct_v0.2,if_lb,51.65,[],livebench_240701
|
350 |
+
348,phi_3_mini_4k_instruct,if_lb,40.05,[],livebench_240701
|
351 |
+
349,zephyr_7b_alpha,if_lb,52.79,[],livebench_240701
|
352 |
+
350,phi_3_mini_128k_instruct,if_lb,49.65,[],livebench_240701
|
353 |
+
351,zephyr_7b_beta,if_lb,48.32,[],livebench_240701
|
354 |
+
352,deepseek_v2_lite_chat,if_lb,41.83,[],livebench_240701
|
355 |
+
353,qwen1.5_7b_chat,if_lb,44.12,[],livebench_240701
|
356 |
+
354,starling_lm_7b_beta,if_lb,38.32,[],livebench_240701
|
357 |
+
355,vicuna_7b_v1.5_16k,if_lb,42.12,[],livebench_240701
|
358 |
+
356,vicuna_7b_v1.5,if_lb,41.75,[],livebench_240701
|
359 |
+
357,qwen1.5_4b_chat,if_lb,27.75,[],livebench_240701
|
360 |
+
358,llama_2_7b_chat,if_lb,44.88,[],livebench_240701
|
361 |
+
359,qwen2_1.5b_instruct,if_lb,25.9,[],livebench_240701
|
362 |
+
360,yi_6b_chat,if_lb,27.22,[],livebench_240701
|
363 |
+
361,qwen2_0.5b_instruct,if_lb,26.63,[],livebench_240701
|
364 |
+
362,qwen1.5_1.8b_chat,if_lb,22.9,[],livebench_240701
|
365 |
+
363,qwen1.5_0.5b_chat,if_lb,21.3,[],livebench_240701
|