Yotam Perlitz committed on
Commit
0f8e886
•
1 Parent(s): ecb1e20

Signed-off-by: Yotam Perlitz <yotam.perlitz@ibm.com>

app.py CHANGED
@@ -3,6 +3,63 @@ import pandas as pd
3
 
4
  st.title("β€Žβ€Žβ€Ž β€Žβ€Ž β€Ž β€Ž β€Ž β€Ž β€Ž β€ŽπŸ‹οΈβ€β™‚οΈ benchbench-Leaderboard πŸ‹οΈβ€β™‚οΈ")
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  # df = pd.read_csv("BAT_w_arena_10_random.csv")
7
  # df = (
8
  # (
 
3
 
4
  st.title("β€Žβ€Žβ€Ž β€Žβ€Ž β€Ž β€Ž β€Ž β€Ž β€Ž β€ŽπŸ‹οΈβ€β™‚οΈ benchbench-Leaderboard πŸ‹οΈβ€β™‚οΈ")
5
 
6
+ import pandas as pd
7
+ from bat import Tester, Config, Benchmark, Reporter
8
+ from bat.utils import get_holistic_benchmark
9
+
10
+
11
+ cfg = Config(
12
+ exp_to_run="example",
13
+ n_models_taken_list=[0],
14
+ model_select_strategy_list=["random"],
15
+ n_exps=10,
16
+ # reference_data_path="data/combined_holistic.csv",
17
+ )
18
+
19
+
20
+ newbench_name = "livebench"
21
+ new_bench_agg_name = f"{newbench_name}_mwr"
22
+
23
+ tester = Tester(cfg=cfg)
24
+
25
+ # models_for_benchmark_scoring = tester.fetch_reference_models_names(
26
+ # reference_benchmark=get_holistic_benchmark(), n_models=20
27
+ # )
28
+
29
+ newbench = Benchmark(
30
+ pd.read_csv(f"assets/{newbench_name}.csv"),
31
+ data_source=newbench_name,
32
+ )
33
+
34
+ # newbench.add_aggragete(new_col_name=new_bench_agg_name)
35
+ # newbench_agreements = tester.all_vs_all_agreement_testing(newbench)
36
+
37
+ reporter = Reporter()
38
+ # reporter.draw_agreements(
39
+ # newbench_agreements, ref_sources=[newbench_name], scenario_sources=[newbench_name]
40
+ # )
41
+
42
+ holistic = get_holistic_benchmark()
43
+ holistic.add_aggragete(new_col_name="aggregate", agg_source_name="holistic")
44
+
45
+ allbench = newbench.extend(holistic)
46
+ allbench.clear_repeated_scenarios(source_to_keep=newbench_name)
47
+
48
+
49
+ @st.cache_data
50
+ def run_load():
51
+ return tester.all_vs_all_agreement_testing(allbench)
52
+
53
+
54
+ all_agreements = run_load()
55
+
56
+ observed_scenario = "arena_elo" # "livebench_lb"
57
+ blacklist_sources = [] # "livebench"
58
+
59
+ z_score = reporter.get_z_score(all_agreements, observed_scenario, blacklist_sources)
60
+
61
+ st.write(f"zscore of {observed_scenario}: {z_score}")
62
+
63
  # df = pd.read_csv("BAT_w_arena_10_random.csv")
64
  # df = (
65
  # (
assets/combined_20240704.csv ADDED
The diff for this file is too large to render. See raw diff
 
assets/combined_holistic.csv ADDED
@@ -0,0 +1,825 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,model,score,scenario,source,aggragated_from
2
+ 0,gpt-4-turbo-2024-04-09,82.6,arena-hard,arena_hard_2404,[]
3
+ 1,gpt-4-0125-preview,78.0,arena-hard,arena_hard_2404,[]
4
+ 2,gemini-1.5-pro-api-preview,72.0,arena-hard,arena_hard_2404,[]
5
+ 3,yi-large,63.7,arena-hard,arena_hard_2404,[]
6
+ 4,claude-3-opus-20240229,60.4,arena-hard,arena_hard_2404,[]
7
+ 5,glm-4,55.7,arena-hard,arena_hard_2404,[]
8
+ 6,gpt-4-0314,50.0,arena-hard,arena_hard_2404,[]
9
+ 7,gemini-1.5-flash-api-preview,49.6,arena-hard,arena_hard_2404,[]
10
+ 8,claude-3-sonnet-20240229,46.8,arena-hard,arena_hard_2404,[]
11
+ 9,claude-3-haiku-20240307,41.5,arena-hard,arena_hard_2404,[]
12
+ 10,llama-3-70b-chat-hf,41.1,arena-hard,arena_hard_2404,[]
13
+ 11,gpt-4-0613,37.9,arena-hard,arena_hard_2404,[]
14
+ 12,mistral-large-2402,37.7,arena-hard,arena_hard_2404,[]
15
+ 13,mixtral-8x22b-instruct-v0.1,36.4,arena-hard,arena_hard_2404,[]
16
+ 14,qwen1.5-72b-chat,36.1,arena-hard,arena_hard_2404,[]
17
+ 15,command-r-plus,33.1,arena-hard,arena_hard_2404,[]
18
+ 16,mistral-medium,31.9,arena-hard,arena_hard_2404,[]
19
+ 17,mistral-next,27.4,arena-hard,arena_hard_2404,[]
20
+ 18,gpt-3.5-turbo-0613,24.8,arena-hard,arena_hard_2404,[]
21
+ 19,claude-2.0,24.0,arena-hard,arena_hard_2404,[]
22
+ 20,dbrx-instructruct,23.9,arena-hard,arena_hard_2404,[]
23
+ 21,mixtral-8x7b-instruct-v0.1,23.4,arena-hard,arena_hard_2404,[]
24
+ 22,gpt-3.5-turbo-0125,23.3,arena-hard,arena_hard_2404,[]
25
+ 23,yi-34b-chat,23.1,arena-hard,arena_hard_2404,[]
26
+ 24,starling-lm-7b-beta,23.0,arena-hard,arena_hard_2404,[]
27
+ 25,claude-2.1,22.8,arena-hard,arena_hard_2404,[]
28
+ 26,snorkel-mistral-pairrm-dpo,20.7,arena-hard,arena_hard_2404,[]
29
+ 27,llama-3-8b-chat-hf,20.6,arena-hard,arena_hard_2404,[]
30
+ 28,gpt-3.5-turbo-1106,18.9,arena-hard,arena_hard_2404,[]
31
+ 29,gpt-3.5-turbo-0301,18.1,arena-hard,arena_hard_2404,[]
32
+ 30,gemini-1.0-pro,17.8,arena-hard,arena_hard_2404,[]
33
+ 31,snowflake-arctic-instruct,17.6,arena-hard,arena_hard_2404,[]
34
+ 32,command-r,17.0,arena-hard,arena_hard_2404,[]
35
+ 33,phi-3-mini-128k-instruct,15.4,arena-hard,arena_hard_2404,[]
36
+ 34,tulu-2-dpo-70b,15.0,arena-hard,arena_hard_2404,[]
37
+ 35,starling-lm-7b-alpha,12.8,arena-hard,arena_hard_2404,[]
38
+ 36,mistral-7b-instruct,12.6,arena-hard,arena_hard_2404,[]
39
+ 37,gemma-1.1-7b-it,12.1,arena-hard,arena_hard_2404,[]
40
+ 38,llama-2-70b-chat-hf,11.6,arena-hard,arena_hard_2404,[]
41
+ 39,vicuna-33b-v1.3,8.6,arena-hard,arena_hard_2404,[]
42
+ 40,gemma-7b-it,7.5,arena-hard,arena_hard_2404,[]
43
+ 41,llama-2-7b-chat-hf,4.6,arena-hard,arena_hard_2404,[]
44
+ 42,gemma-1.1-2b-it,3.4,arena-hard,arena_hard_2404,[]
45
+ 43,gemma-2b-it,3.0,arena-hard,arena_hard_2404,[]
46
+ 0,gpt-4o-2024-05-13,64.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
47
+ 1,claude-3-opus,63.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
48
+ 2,gpt-4-turbo-2024-04-09,62.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
49
+ 3,gemini-1.5-pro-api-0409,58.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
50
+ 4,yi-large-preview,56.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
51
+ 5,llama-3-70b-instruct,55.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
52
+ 6,qwen-max-0428,55.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
53
+ 7,claude-3-sonnet,54.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
54
+ 8,reka-core-20240415,52.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
55
+ 9,mammoth2-8x7b-plus,51.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
56
+ 10,deepseek-v2,51.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
57
+ 11,command-r-plus,51.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
58
+ 12,yi-1.5-34b-chat,51.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
59
+ 13,mistral-large,50.3,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
60
+ 14,qwen1.5-72b-chat,48.3,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
61
+ 15,mistral-medium,47.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
62
+ 16,gemini-1.0-pro,46.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
63
+ 17,reka-flash-20240226,46.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
64
+ 18,mistral-small,46.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
65
+ 19,llama-3-8b-instruct,45.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
66
+ 20,command-r,45.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
67
+ 21,qwen1.5-32b-chat,43.3,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
68
+ 22,gpt-3.5-turbo-0125,43.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
69
+ 23,claude-3-haiku,42.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
70
+ 24,yi-34b-chat,42.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
71
+ 25,mixtral-8x7b-instruct-v0.1,42.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
72
+ 26,starling-lm-7b-beta,41.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
73
+ 27,yi-1.5-9b-chat,40.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
74
+ 28,gemma-1.1-7b-it,39.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
75
+ 29,vicuna-33b-v1.3,38.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
76
+ 30,llama-2-70b-chat,38.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
77
+ 31,map-neo-instruct-v0.1,37.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
78
+ 32,mistral-7b-instruct-v0.2,36.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
79
+ 33,qwen1.5-7b-chat,35.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
80
+ 34,reka-edge-20240208,32.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
81
+ 35,zephyr-7b-beta,31.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
82
+ 36,llama-2-7b-chat,30.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
83
+ 37,yi-6b-chat,30.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
84
+ 38,qwen1.5-moe-a2.7b-chat,29.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
85
+ 39,gemma-1.1-2b-it,28.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
86
+ 40,vicuna-7b-v1.5,27.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
87
+ 41,olmo-7b-instruct,26.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
88
+ 42,qwen1.5-4b-chat,24.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
89
+ 43,jetmoe-8b-chat,24.3,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
90
+ 44,mpt-7b-chat,23.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
91
+ 45,llama-3-70b,54.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
92
+ 46,qwen1.5-72b,41.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
93
+ 47,yi-34b,47.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
94
+ 48,qwen1.5-32b,41.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
95
+ 49,mixtral-8x7b,40.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
96
+ 50,llama-2-70b,41.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
97
+ 51,qwen1.5-moe-a2.7b,33.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
98
+ 52,qwen1.5-7b,33.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
99
+ 53,llama-3-8b,31.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
100
+ 54,mistral-7b,27.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
101
+ 55,gemma-7b,32.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
102
+ 56,yi-6b,30.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
103
+ 57,qwen1.5-4b,23.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
104
+ 58,jetmoe-8b,27.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
105
+ 59,deepseek-7b,21.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
106
+ 60,phi-2,21.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
107
+ 61,deepseekmoe-16b,24.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
108
+ 62,llama-2-7b,22.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
109
+ 63,gemma-2b,22.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
110
+ 64,olmo-7b,21.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
111
+ 65,mpt-7b,17.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
112
+ 66,gpt-4o-2024-05-13,87.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
113
+ 67,claude-3-opus,88.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
114
+ 68,gpt-4-turbo-2024-04-09,88.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
115
+ 69,gemini-1.5-pro-api-0409,84.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
116
+ 70,yi-large-preview,84.4,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
117
+ 71,llama-3-70b-instruct,84.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
118
+ 72,qwen-max-0428,86.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
119
+ 73,claude-3-sonnet,81.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
120
+ 74,reka-core-20240415,83.3,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
121
+ 75,mammoth2-8x7b-plus,81.5,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
122
+ 76,deepseek-v2,83.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
123
+ 77,command-r-plus,81.5,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
124
+ 78,yi-1.5-34b-chat,81.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
125
+ 79,mistral-large,84.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
126
+ 80,qwen1.5-72b-chat,84.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
127
+ 81,mistral-medium,81.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
128
+ 82,gemini-1.0-pro,78.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
129
+ 83,reka-flash-20240226,79.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
130
+ 84,mistral-small,81.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
131
+ 85,llama-3-8b-instruct,75.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
132
+ 86,command-r,77.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
133
+ 87,qwen1.5-32b-chat,81.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
134
+ 88,gpt-3.5-turbo-0125,79.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
135
+ 89,claude-3-haiku,79.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
136
+ 90,yi-34b-chat,80.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
137
+ 91,mixtral-8x7b-instruct-v0.1,76.4,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
138
+ 92,starling-lm-7b-beta,74.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
139
+ 93,yi-1.5-9b-chat,74.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
140
+ 94,gemma-1.1-7b-it,69.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
141
+ 95,vicuna-33b-v1.3,66.3,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
142
+ 96,llama-2-70b-chat,74.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
143
+ 97,map-neo-instruct-v0.1,70.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
144
+ 98,mistral-7b-instruct-v0.2,70.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
145
+ 99,qwen1.5-7b-chat,71.4,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
146
+ 100,reka-edge-20240208,68.5,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
147
+ 101,zephyr-7b-beta,69.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
148
+ 102,llama-2-7b-chat,61.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
149
+ 103,yi-6b-chat,65.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
150
+ 104,qwen1.5-moe-a2.7b-chat,69.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
151
+ 105,gemma-1.1-2b-it,51.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
152
+ 106,vicuna-7b-v1.5,60.3,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
153
+ 107,olmo-7b-instruct,55.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
154
+ 108,qwen1.5-4b-chat,57.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
155
+ 109,jetmoe-8b-chat,51.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
156
+ 110,mpt-7b-chat,43.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
157
+ 111,llama-3-70b,82.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
158
+ 112,qwen1.5-72b,79.5,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
159
+ 113,yi-34b,78.3,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
160
+ 114,qwen1.5-32b,77.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
161
+ 115,mixtral-8x7b,74.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
162
+ 116,llama-2-70b,73.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
163
+ 117,qwen1.5-moe-a2.7b,70.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
164
+ 118,qwen1.5-7b,68.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
165
+ 119,llama-3-8b,65.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
166
+ 120,mistral-7b,64.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
167
+ 121,gemma-7b,64.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
168
+ 122,yi-6b,63.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
169
+ 123,qwen1.5-4b,58.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
170
+ 124,jetmoe-8b,57.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
171
+ 125,deepseek-7b,52.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
172
+ 126,phi-2,51.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
173
+ 127,deepseekmoe-16b,51.4,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
174
+ 128,llama-2-7b,43.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
175
+ 129,gemma-2b,38.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
176
+ 130,olmo-7b,31.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
177
+ 131,mpt-7b,30.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
178
+ 264,gpt-4o-2024-05-13,85.4,mmlu-mixed,mixeval_240601,[]
179
+ 265,claude-3-opus,83.2,mmlu-mixed,mixeval_240601,[]
180
+ 266,gpt-4-turbo-2024-04-09,82.8,mmlu-mixed,mixeval_240601,[]
181
+ 267,gemini-1.5-pro-api-0409,79.2,mmlu-mixed,mixeval_240601,[]
182
+ 268,yi-large-preview,80.9,mmlu-mixed,mixeval_240601,[]
183
+ 269,llama-3-70b-instruct,80.5,mmlu-mixed,mixeval_240601,[]
184
+ 270,qwen-max-0428,80.6,mmlu-mixed,mixeval_240601,[]
185
+ 271,claude-3-sonnet,74.7,mmlu-mixed,mixeval_240601,[]
186
+ 272,reka-core-20240415,79.3,mmlu-mixed,mixeval_240601,[]
187
+ 273,mammoth2-8x7b-plus,74.5,mmlu-mixed,mixeval_240601,[]
188
+ 274,deepseek-v2,77.3,mmlu-mixed,mixeval_240601,[]
189
+ 275,command-r-plus,78.9,mmlu-mixed,mixeval_240601,[]
190
+ 276,yi-1.5-34b-chat,76.4,mmlu-mixed,mixeval_240601,[]
191
+ 277,mistral-large,80.2,mmlu-mixed,mixeval_240601,[]
192
+ 278,qwen1.5-72b-chat,80.1,mmlu-mixed,mixeval_240601,[]
193
+ 279,mistral-medium,76.3,mmlu-mixed,mixeval_240601,[]
194
+ 280,gemini-1.0-pro,74.9,mmlu-mixed,mixeval_240601,[]
195
+ 281,reka-flash-20240226,75.4,mmlu-mixed,mixeval_240601,[]
196
+ 282,mistral-small,75.2,mmlu-mixed,mixeval_240601,[]
197
+ 283,llama-3-8b-instruct,71.9,mmlu-mixed,mixeval_240601,[]
198
+ 284,command-r,75.0,mmlu-mixed,mixeval_240601,[]
199
+ 285,qwen1.5-32b-chat,78.0,mmlu-mixed,mixeval_240601,[]
200
+ 286,gpt-3.5-turbo-0125,74.5,mmlu-mixed,mixeval_240601,[]
201
+ 287,claude-3-haiku,76.1,mmlu-mixed,mixeval_240601,[]
202
+ 288,yi-34b-chat,73.6,mmlu-mixed,mixeval_240601,[]
203
+ 289,mixtral-8x7b-instruct-v0.1,72.0,mmlu-mixed,mixeval_240601,[]
204
+ 290,starling-lm-7b-beta,69.0,mmlu-mixed,mixeval_240601,[]
205
+ 291,yi-1.5-9b-chat,72.6,mmlu-mixed,mixeval_240601,[]
206
+ 292,gemma-1.1-7b-it,66.9,mmlu-mixed,mixeval_240601,[]
207
+ 293,vicuna-33b-v1.3,59.2,mmlu-mixed,mixeval_240601,[]
208
+ 294,llama-2-70b-chat,69.8,mmlu-mixed,mixeval_240601,[]
209
+ 295,map-neo-instruct-v0.1,66.7,mmlu-mixed,mixeval_240601,[]
210
+ 296,mistral-7b-instruct-v0.2,67.3,mmlu-mixed,mixeval_240601,[]
211
+ 297,qwen1.5-7b-chat,68.7,mmlu-mixed,mixeval_240601,[]
212
+ 298,reka-edge-20240208,63.6,mmlu-mixed,mixeval_240601,[]
213
+ 299,zephyr-7b-beta,64.9,mmlu-mixed,mixeval_240601,[]
214
+ 300,llama-2-7b-chat,59.4,mmlu-mixed,mixeval_240601,[]
215
+ 301,yi-6b-chat,65.4,mmlu-mixed,mixeval_240601,[]
216
+ 302,qwen1.5-moe-a2.7b-chat,69.5,mmlu-mixed,mixeval_240601,[]
217
+ 303,gemma-1.1-2b-it,51.5,mmlu-mixed,mixeval_240601,[]
218
+ 304,vicuna-7b-v1.5,58.7,mmlu-mixed,mixeval_240601,[]
219
+ 305,olmo-7b-instruct,57.1,mmlu-mixed,mixeval_240601,[]
220
+ 306,qwen1.5-4b-chat,61.4,mmlu-mixed,mixeval_240601,[]
221
+ 307,jetmoe-8b-chat,58.5,mmlu-mixed,mixeval_240601,[]
222
+ 308,mpt-7b-chat,37.8,mmlu-mixed,mixeval_240601,[]
223
+ 309,llama-3-70b,79.8,mmlu-mixed,mixeval_240601,[]
224
+ 310,qwen1.5-72b,78.8,mmlu-mixed,mixeval_240601,[]
225
+ 311,yi-34b,79.3,mmlu-mixed,mixeval_240601,[]
226
+ 312,qwen1.5-32b,77.2,mmlu-mixed,mixeval_240601,[]
227
+ 313,mixtral-8x7b,71.6,mmlu-mixed,mixeval_240601,[]
228
+ 314,llama-2-70b,70.8,mmlu-mixed,mixeval_240601,[]
229
+ 315,qwen1.5-moe-a2.7b,69.4,mmlu-mixed,mixeval_240601,[]
230
+ 316,qwen1.5-7b,67.0,mmlu-mixed,mixeval_240601,[]
231
+ 317,llama-3-8b,69.5,mmlu-mixed,mixeval_240601,[]
232
+ 318,mistral-7b,68.5,mmlu-mixed,mixeval_240601,[]
233
+ 319,gemma-7b,67.4,mmlu-mixed,mixeval_240601,[]
234
+ 320,yi-6b,71.2,mmlu-mixed,mixeval_240601,[]
235
+ 321,qwen1.5-4b,59.6,mmlu-mixed,mixeval_240601,[]
236
+ 322,jetmoe-8b,55.3,mmlu-mixed,mixeval_240601,[]
237
+ 323,deepseek-7b,53.3,mmlu-mixed,mixeval_240601,[]
238
+ 324,phi-2,62.5,mmlu-mixed,mixeval_240601,[]
239
+ 325,deepseekmoe-16b,49.9,mmlu-mixed,mixeval_240601,[]
240
+ 326,llama-2-7b,40.8,mmlu-mixed,mixeval_240601,[]
241
+ 327,gemma-2b,37.4,mmlu-mixed,mixeval_240601,[]
242
+ 328,olmo-7b,29.7,mmlu-mixed,mixeval_240601,[]
243
+ 329,mpt-7b,30.9,mmlu-mixed,mixeval_240601,[]
244
+ 594,gpt-4o-2024-05-13,57.1,mmlu-hard-mixed,mixeval_240601,[]
245
+ 595,claude-3-opus,55.0,mmlu-hard-mixed,mixeval_240601,[]
246
+ 596,gpt-4-turbo-2024-04-09,45.5,mmlu-hard-mixed,mixeval_240601,[]
247
+ 597,gemini-1.5-pro-api-0409,44.6,mmlu-hard-mixed,mixeval_240601,[]
248
+ 598,yi-large-preview,48.5,mmlu-hard-mixed,mixeval_240601,[]
249
+ 599,llama-3-70b-instruct,46.3,mmlu-hard-mixed,mixeval_240601,[]
250
+ 600,qwen-max-0428,41.6,mmlu-hard-mixed,mixeval_240601,[]
251
+ 601,claude-3-sonnet,40.7,mmlu-hard-mixed,mixeval_240601,[]
252
+ 602,reka-core-20240415,46.3,mmlu-hard-mixed,mixeval_240601,[]
253
+ 603,mammoth2-8x7b-plus,41.1,mmlu-hard-mixed,mixeval_240601,[]
254
+ 604,deepseek-v2,42.0,mmlu-hard-mixed,mixeval_240601,[]
255
+ 605,command-r-plus,42.0,mmlu-hard-mixed,mixeval_240601,[]
256
+ 606,yi-1.5-34b-chat,38.1,mmlu-hard-mixed,mixeval_240601,[]
257
+ 607,mistral-large,42.4,mmlu-hard-mixed,mixeval_240601,[]
258
+ 608,qwen1.5-72b-chat,37.7,mmlu-hard-mixed,mixeval_240601,[]
259
+ 609,mistral-medium,38.5,mmlu-hard-mixed,mixeval_240601,[]
260
+ 610,gemini-1.0-pro,35.5,mmlu-hard-mixed,mixeval_240601,[]
261
+ 611,reka-flash-20240226,34.6,mmlu-hard-mixed,mixeval_240601,[]
262
+ 612,mistral-small,33.8,mmlu-hard-mixed,mixeval_240601,[]
263
+ 613,llama-3-8b-instruct,40.7,mmlu-hard-mixed,mixeval_240601,[]
264
+ 614,command-r,39.0,mmlu-hard-mixed,mixeval_240601,[]
265
+ 615,qwen1.5-32b-chat,29.9,mmlu-hard-mixed,mixeval_240601,[]
266
+ 616,gpt-3.5-turbo-0125,35.1,mmlu-hard-mixed,mixeval_240601,[]
267
+ 617,claude-3-haiku,30.7,mmlu-hard-mixed,mixeval_240601,[]
268
+ 618,yi-34b-chat,29.9,mmlu-hard-mixed,mixeval_240601,[]
269
+ 619,mixtral-8x7b-instruct-v0.1,37.2,mmlu-hard-mixed,mixeval_240601,[]
270
+ 620,starling-lm-7b-beta,34.2,mmlu-hard-mixed,mixeval_240601,[]
271
+ 621,yi-1.5-9b-chat,36.8,mmlu-hard-mixed,mixeval_240601,[]
272
+ 622,gemma-1.1-7b-it,39.0,mmlu-hard-mixed,mixeval_240601,[]
273
+ 623,vicuna-33b-v1.3,39.4,mmlu-hard-mixed,mixeval_240601,[]
274
+ 624,llama-2-70b-chat,27.7,mmlu-hard-mixed,mixeval_240601,[]
275
+ 625,map-neo-instruct-v0.1,32.5,mmlu-hard-mixed,mixeval_240601,[]
276
+ 626,mistral-7b-instruct-v0.2,29.4,mmlu-hard-mixed,mixeval_240601,[]
277
+ 627,qwen1.5-7b-chat,29.0,mmlu-hard-mixed,mixeval_240601,[]
278
+ 628,reka-edge-20240208,26.4,mmlu-hard-mixed,mixeval_240601,[]
279
+ 629,zephyr-7b-beta,24.2,mmlu-hard-mixed,mixeval_240601,[]
280
+ 630,llama-2-7b-chat,30.3,mmlu-hard-mixed,mixeval_240601,[]
281
+ 631,yi-6b-chat,26.8,mmlu-hard-mixed,mixeval_240601,[]
282
+ 632,qwen1.5-moe-a2.7b-chat,26.8,mmlu-hard-mixed,mixeval_240601,[]
283
+ 633,gemma-1.1-2b-it,30.3,mmlu-hard-mixed,mixeval_240601,[]
284
+ 634,vicuna-7b-v1.5,23.4,mmlu-hard-mixed,mixeval_240601,[]
285
+ 635,olmo-7b-instruct,27.3,mmlu-hard-mixed,mixeval_240601,[]
286
+ 636,qwen1.5-4b-chat,17.3,mmlu-hard-mixed,mixeval_240601,[]
287
+ 637,jetmoe-8b-chat,25.5,mmlu-hard-mixed,mixeval_240601,[]
288
+ 638,mpt-7b-chat,24.7,mmlu-hard-mixed,mixeval_240601,[]
289
+ 639,llama-3-70b,39.8,mmlu-hard-mixed,mixeval_240601,[]
290
+ 640,qwen1.5-72b,42.4,mmlu-hard-mixed,mixeval_240601,[]
291
+ 641,yi-34b,42.4,mmlu-hard-mixed,mixeval_240601,[]
292
+ 642,qwen1.5-32b,37.2,mmlu-hard-mixed,mixeval_240601,[]
293
+ 643,mixtral-8x7b,34.6,mmlu-hard-mixed,mixeval_240601,[]
294
+ 644,llama-2-70b,29.0,mmlu-hard-mixed,mixeval_240601,[]
295
+ 645,qwen1.5-moe-a2.7b,30.7,mmlu-hard-mixed,mixeval_240601,[]
296
+ 646,qwen1.5-7b,28.6,mmlu-hard-mixed,mixeval_240601,[]
297
+ 647,llama-3-8b,38.5,mmlu-hard-mixed,mixeval_240601,[]
298
+ 648,mistral-7b,27.7,mmlu-hard-mixed,mixeval_240601,[]
299
+ 649,gemma-7b,28.1,mmlu-hard-mixed,mixeval_240601,[]
300
+ 650,yi-6b,37.2,mmlu-hard-mixed,mixeval_240601,[]
301
+ 651,qwen1.5-4b,22.9,mmlu-hard-mixed,mixeval_240601,[]
302
+ 652,jetmoe-8b,27.3,mmlu-hard-mixed,mixeval_240601,[]
303
+ 653,deepseek-7b,26.4,mmlu-hard-mixed,mixeval_240601,[]
304
+ 654,phi-2,29.0,mmlu-hard-mixed,mixeval_240601,[]
305
+ 655,deepseekmoe-16b,30.7,mmlu-hard-mixed,mixeval_240601,[]
306
+ 656,llama-2-7b,24.7,mmlu-hard-mixed,mixeval_240601,[]
307
+ 657,gemma-2b,27.3,mmlu-hard-mixed,mixeval_240601,[]
308
+ 658,olmo-7b,25.1,mmlu-hard-mixed,mixeval_240601,[]
309
+ 659,mpt-7b,24.2,mmlu-hard-mixed,mixeval_240601,[]
310
+ 593,gpt-4-0314,0.57,agieval,BLZ_240312,[]
311
+ 594,gpt-4-0613,0.57,agieval,BLZ_240312,[]
312
+ 596,claude-1,0.49700000000000005,agieval,BLZ_240312,[]
313
+ 601,mixtral-8x7b-instruct-v0.1,0.45299999999999996,agieval,BLZ_240312,[]
314
+ 602,yi-34b-chat,0.508,agieval,BLZ_240312,[]
315
+ 605,gpt-3.5-turbo-0314,0.43200000000000005,agieval,BLZ_240312,[]
316
+ 608,vicuna-33b,0.373,agieval,BLZ_240312,[]
317
+ 609,starling-lm-7b-alpha,0.401,agieval,BLZ_240312,[]
318
+ 611,llama-2-70b-chat,0.45,agieval,BLZ_240312,[]
319
+ 613,openhermes-2.5-mistral-7b,0.43,agieval,BLZ_240312,[]
320
+ 614,openchat-3.5,0.42700000000000005,agieval,BLZ_240312,[]
321
+ 617,solar-10.7b-instruct-v1.0,0.47600000000000003,agieval,BLZ_240312,[]
322
+ 618,dolphin-2.2.1-mistral-7b,0.392,agieval,BLZ_240312,[]
323
+ 620,zephyr-7b-beta,0.406,agieval,BLZ_240312,[]
324
+ 623,llama-2-13b-chat,0.336,agieval,BLZ_240312,[]
325
+ 624,vicuna-13b,0.368,agieval,BLZ_240312,[]
326
+ 626,zephyr-7b-alpha,0.38,agieval,BLZ_240312,[]
327
+ 627,qwen-14b-chat,0.396,agieval,BLZ_240312,[]
328
+ 630,llama-2-7b-chat,0.29600000000000004,agieval,BLZ_240312,[]
329
+ 632,mistral-7b-instruct-v0.1,0.335,agieval,BLZ_240312,[]
330
+ 634,vicuna-7b,0.314,agieval,BLZ_240312,[]
331
+ 636,chatglm3-6b,0.414,agieval,BLZ_240312,[]
332
+ 643,chatglm-6b,0.325,agieval,BLZ_240312,[]
333
+ 647,llama-13b,0.205,agieval,BLZ_240312,[]
334
+ 886,gpt-4-1106-preview,0.977,alpacav1,BLZ_240312,[]
335
+ 888,gpt-4-0314,0.9528,alpacav1,BLZ_240312,[]
336
+ 889,gpt-4-0613,0.9528,alpacav1,BLZ_240312,[]
337
+ 890,mistral-medium,0.9682999999999999,alpacav1,BLZ_240312,[]
338
+ 891,claude-1,0.8839,alpacav1,BLZ_240312,[]
339
+ 892,claude-2.0,0.9136,alpacav1,BLZ_240312,[]
340
+ 893,gemini-pro-dev-api,0.7966,alpacav1,BLZ_240312,[]
341
+ 894,claude-2.1,0.8708,alpacav1,BLZ_240312,[]
342
+ 895,gpt-3.5-turbo-0613,0.8937,alpacav1,BLZ_240312,[]
343
+ 896,mixtral-8x7b-instruct-v0.1,0.9478,alpacav1,BLZ_240312,[]
344
+ 897,yi-34b-chat,0.9408,alpacav1,BLZ_240312,[]
345
+ 898,gemini-pro,0.7966,alpacav1,BLZ_240312,[]
346
+ 900,gpt-3.5-turbo-0314,0.8937,alpacav1,BLZ_240312,[]
347
+ 902,tulu-2-dpo-70b,0.9503,alpacav1,BLZ_240312,[]
348
+ 903,vicuna-33b,0.8898999999999999,alpacav1,BLZ_240312,[]
349
+ 904,starling-lm-7b-alpha,0.9198999999999999,alpacav1,BLZ_240312,[]
350
+ 906,llama-2-70b-chat,0.9266,alpacav1,BLZ_240312,[]
351
+ 909,openchat-3.5,0.8851,alpacav1,BLZ_240312,[]
352
+ 911,gpt-3.5-turbo-1106,0.8626,alpacav1,BLZ_240312,[]
353
+ 914,wizardlm-13b-v1.2,0.8917,alpacav1,BLZ_240312,[]
354
+ 915,zephyr-7b-beta,0.9059999999999999,alpacav1,BLZ_240312,[]
355
+ 918,llama-2-13b-chat,0.8109000000000001,alpacav1,BLZ_240312,[]
356
+ 921,zephyr-7b-alpha,0.8576,alpacav1,BLZ_240312,[]
357
+ 924,guanaco-33b,0.6596,alpacav1,BLZ_240312,[]
358
+ 925,llama-2-7b-chat,0.7137,alpacav1,BLZ_240312,[]
359
+ 934,chatglm2-6b,0.47130000000000005,alpacav1,BLZ_240312,[]
360
+ 937,openassistant-pythia-12b,0.2596,alpacav1,BLZ_240312,[]
361
+ 827,gpt-4-1106-preview,0.5,alpacav2,BLZ_240312,[]
362
+ 829,gpt-4-0314,0.221,alpacav2,BLZ_240312,[]
363
+ 830,gpt-4-0613,0.158,alpacav2,BLZ_240312,[]
364
+ 831,mistral-medium,0.21899999999999997,alpacav2,BLZ_240312,[]
365
+ 832,claude-1,0.17,alpacav2,BLZ_240312,[]
366
+ 833,claude-2.0,0.172,alpacav2,BLZ_240312,[]
367
+ 834,gemini-pro-dev-api,0.16899999999999998,alpacav2,BLZ_240312,[]
368
+ 835,claude-2.1,0.157,alpacav2,BLZ_240312,[]
369
+ 836,gpt-3.5-turbo-0613,0.141,alpacav2,BLZ_240312,[]
370
+ 837,mixtral-8x7b-instruct-v0.1,0.183,alpacav2,BLZ_240312,[]
371
+ 838,yi-34b-chat,0.297,alpacav2,BLZ_240312,[]
372
+ 839,gemini-pro,0.16899999999999998,alpacav2,BLZ_240312,[]
373
+ 840,claude-instant-1,0.161,alpacav2,BLZ_240312,[]
374
+ 841,gpt-3.5-turbo-0314,0.096,alpacav2,BLZ_240312,[]
375
+ 842,wizardlm-70b-v1.0,0.14400000000000002,alpacav2,BLZ_240312,[]
376
+ 843,tulu-2-dpo-70b,0.16,alpacav2,BLZ_240312,[]
377
+ 844,vicuna-33b,0.127,alpacav2,BLZ_240312,[]
378
+ 845,starling-lm-7b-alpha,0.142,alpacav2,BLZ_240312,[]
379
+ 846,deepseek-llm-67b-chat,0.121,alpacav2,BLZ_240312,[]
380
+ 847,llama-2-70b-chat,0.139,alpacav2,BLZ_240312,[]
381
+ 849,openhermes-2.5-mistral-7b,0.10300000000000001,alpacav2,BLZ_240312,[]
382
+ 852,gpt-3.5-turbo-1106,0.092,alpacav2,BLZ_240312,[]
383
+ 854,dolphin-2.2.1-mistral-7b,0.09,alpacav2,BLZ_240312,[]
384
+ 855,wizardlm-13b-v1.2,0.12,alpacav2,BLZ_240312,[]
385
+ 856,zephyr-7b-beta,0.11,alpacav2,BLZ_240312,[]
386
+ 859,llama-2-13b-chat,0.077,alpacav2,BLZ_240312,[]
387
+ 860,vicuna-13b,0.067,alpacav2,BLZ_240312,[]
388
+ 862,zephyr-7b-alpha,0.084,alpacav2,BLZ_240312,[]
389
+ 863,qwen-14b-chat,0.075,alpacav2,BLZ_240312,[]
390
+ 865,guanaco-33b,0.05,alpacav2,BLZ_240312,[]
391
+ 866,llama-2-7b-chat,0.0496,alpacav2,BLZ_240312,[]
392
+ 870,vicuna-7b,0.048,alpacav2,BLZ_240312,[]
393
+ 875,chatglm2-6b,0.027999999999999997,alpacav2,BLZ_240312,[]
394
+ 878,openassistant-pythia-12b,0.018000000000000002,alpacav2,BLZ_240312,[]
395
+ 1299,gpt-4-1106-preview,0.32799999999999996,alpacaeval2-lc,BLZ_240312,[]
396
+ 1301,gpt-4-0314,0.21600000000000003,alpacaeval2-lc,BLZ_240312,[]
397
+ 1302,gpt-4-0613,0.18600000000000003,alpacaeval2-lc,BLZ_240312,[]
398
+ 1303,mistral-medium,0.196,alpacaeval2-lc,BLZ_240312,[]
399
+ 1304,claude-1,0.21100000000000002,alpacaeval2-lc,BLZ_240312,[]
400
+ 1305,claude-2.0,0.21600000000000003,alpacaeval2-lc,BLZ_240312,[]
401
+ 1306,gemini-pro-dev-api,0.172,alpacaeval2-lc,BLZ_240312,[]
402
+ 1307,claude-2.1,0.193,alpacaeval2-lc,BLZ_240312,[]
403
+ 1308,gpt-3.5-turbo-0613,0.14300000000000002,alpacaeval2-lc,BLZ_240312,[]
404
+ 1309,mixtral-8x7b-instruct-v0.1,0.168,alpacaeval2-lc,BLZ_240312,[]
405
+ 1310,yi-34b-chat,0.188,alpacaeval2-lc,BLZ_240312,[]
406
+ 1312,claude-instant-1,0.195,alpacaeval2-lc,BLZ_240312,[]
407
+ 1313,gpt-3.5-turbo-0314,0.156,alpacaeval2-lc,BLZ_240312,[]
408
+ 1314,wizardlm-70b-v1.0,0.125,alpacaeval2-lc,BLZ_240312,[]
409
+ 1315,tulu-2-dpo-70b,0.151,alpacaeval2-lc,BLZ_240312,[]
410
+ 1316,vicuna-33b,0.115,alpacaeval2-lc,BLZ_240312,[]
411
+ 1317,starling-lm-7b-alpha,0.10099999999999999,alpacaeval2-lc,BLZ_240312,[]
412
+ 1318,deepseek-llm-67b-chat,0.141,alpacaeval2-lc,BLZ_240312,[]
413
+ 1319,llama-2-70b-chat,0.10400000000000001,alpacaeval2-lc,BLZ_240312,[]
414
+ 1321,openhermes-2.5-mistral-7b,0.126,alpacaeval2-lc,BLZ_240312,[]
415
+ 1324,gpt-3.5-turbo-1106,0.155,alpacaeval2-lc,BLZ_240312,[]
416
+ 1326,dolphin-2.2.1-mistral-7b,0.10800000000000001,alpacaeval2-lc,BLZ_240312,[]
417
+ 1327,wizardlm-13b-v1.2,0.099,alpacaeval2-lc,BLZ_240312,[]
418
+ 1328,zephyr-7b-beta,0.102,alpacaeval2-lc,BLZ_240312,[]
419
+ 1331,llama-2-13b-chat,0.068,alpacaeval2-lc,BLZ_240312,[]
420
+ 1332,vicuna-13b,0.085,alpacaeval2-lc,BLZ_240312,[]
421
+ 1334,zephyr-7b-alpha,0.086,alpacaeval2-lc,BLZ_240312,[]
422
+ 1335,qwen-14b-chat,0.1,alpacaeval2-lc,BLZ_240312,[]
423
+ 1338,llama-2-7b-chat,0.045,alpacaeval2-lc,BLZ_240312,[]
424
+ 1342,vicuna-7b,0.06,alpacaeval2-lc,BLZ_240312,[]
425
+ 0,gpt-4-0125-preview,1.0,arena-elo,BLZ_240312,[]
426
+ 1,gpt-4-1106-preview,0.9992019154030327,arena-elo,BLZ_240312,[]
427
+ 2,bard-gemini-pro,0.9768555466879489,arena-elo,BLZ_240312,[]
428
+ 3,gpt-4-0314,0.9497206703910615,arena-elo,BLZ_240312,[]
429
+ 4,gpt-4-0613,0.9273743016759777,arena-elo,BLZ_240312,[]
430
+ 5,mistral-medium,0.9177972865123704,arena-elo,BLZ_240312,[]
431
+ 6,claude-1,0.9169992019154031,arena-elo,BLZ_240312,[]
432
+ 7,claude-2.0,0.9034317637669593,arena-elo,BLZ_240312,[]
433
+ 8,gemini-pro-dev-api,0.8938547486033519,arena-elo,BLZ_240312,[]
434
+ 9,claude-2.1,0.8930566640063847,arena-elo,BLZ_240312,[]
435
+ 10,gpt-3.5-turbo-0613,0.8922585794094174,arena-elo,BLZ_240312,[]
436
+ 11,mixtral-8x7b-instruct-v0.1,0.8922585794094174,arena-elo,BLZ_240312,[]
437
+ 12,yi-34b-chat,0.8898643256185156,arena-elo,BLZ_240312,[]
438
+ 13,gemini-pro,0.8890662410215483,arena-elo,BLZ_240312,[]
439
+ 14,claude-instant-1,0.8850758180367119,arena-elo,BLZ_240312,[]
440
+ 15,gpt-3.5-turbo-0314,0.8818834796488427,arena-elo,BLZ_240312,[]
441
+ 16,wizardlm-70b-v1.0,0.8818834796488427,arena-elo,BLZ_240312,[]
442
+ 17,tulu-2-dpo-70b,0.8810853950518756,arena-elo,BLZ_240312,[]
443
+ 18,vicuna-33b,0.8723064644852354,arena-elo,BLZ_240312,[]
444
+ 19,starling-lm-7b-alpha,0.8699122106943336,arena-elo,BLZ_240312,[]
445
+ 20,deepseek-llm-67b-chat,0.8635275339185954,arena-elo,BLZ_240312,[]
446
+ 21,llama-2-70b-chat,0.8635275339185954,arena-elo,BLZ_240312,[]
447
+ 22,nv-llama2-70b-steerlm-chat,0.8603351955307262,arena-elo,BLZ_240312,[]
448
+ 23,openhermes-2.5-mistral-7b,0.8603351955307262,arena-elo,BLZ_240312,[]
449
+ 24,openchat-3.5,0.8587390263367917,arena-elo,BLZ_240312,[]
450
+ 25,pplx-70b-online,0.8587390263367917,arena-elo,BLZ_240312,[]
451
+ 26,gpt-3.5-turbo-1106,0.8547486033519553,arena-elo,BLZ_240312,[]
452
+ 27,solar-10.7b-instruct-v1.0,0.8499600957701516,arena-elo,BLZ_240312,[]
453
+ 28,dolphin-2.2.1-mistral-7b,0.8499600957701516,arena-elo,BLZ_240312,[]
454
+ 29,wizardlm-13b-v1.2,0.8443735035913806,arena-elo,BLZ_240312,[]
455
+ 30,zephyr-7b-beta,0.8387869114126097,arena-elo,BLZ_240312,[]
456
+ 31,mpt-30b-chat,0.8332003192338387,arena-elo,BLZ_240312,[]
457
+ 32,codellama-34b-instruct,0.8324022346368715,arena-elo,BLZ_240312,[]
458
+ 33,llama-2-13b-chat,0.8316041500399042,arena-elo,BLZ_240312,[]
459
+ 34,vicuna-13b,0.8300079808459697,arena-elo,BLZ_240312,[]
460
+ 35,pplx-7b-online,0.8284118116520351,arena-elo,BLZ_240312,[]
461
+ 36,zephyr-7b-alpha,0.8276137270550679,arena-elo,BLZ_240312,[]
462
+ 37,qwen-14b-chat,0.825219473264166,arena-elo,BLZ_240312,[]
463
+ 38,falcon-180b-chat,0.8236233040702314,arena-elo,BLZ_240312,[]
464
+ 39,guanaco-33b,0.8236233040702314,arena-elo,BLZ_240312,[]
465
+ 40,llama-2-7b-chat,0.8172386272944933,arena-elo,BLZ_240312,[]
466
+ 41,stripedhyena-nous-7b,0.8140462889066241,arena-elo,BLZ_240312,[]
467
+ 42,mistral-7b-instruct-v0.1,0.8028731045490822,arena-elo,BLZ_240312,[]
468
+ 43,palm-chat-bison-001,0.8028731045490822,arena-elo,BLZ_240312,[]
469
+ 44,vicuna-7b,0.8020750199521149,arena-elo,BLZ_240312,[]
470
+ 45,koala-13b,0.770949720670391,arena-elo,BLZ_240312,[]
471
+ 46,chatglm3-6b,0.7661612130885874,arena-elo,BLZ_240312,[]
472
+ 47,gpt4all-13b-snoozy,0.74780526735834,arena-elo,BLZ_240312,[]
473
+ 48,mpt-7b-chat,0.7430167597765364,arena-elo,BLZ_240312,[]
474
+ 49,chatglm2-6b,0.7422186751795691,arena-elo,BLZ_240312,[]
475
+ 50,rwkv-4-raven-14b,0.7382282521947326,arena-elo,BLZ_240312,[]
476
+ 51,alpaca-13b,0.7214684756584198,arena-elo,BLZ_240312,[]
477
+ 52,openassistant-pythia-12b,0.7158818834796489,arena-elo,BLZ_240312,[]
478
+ 53,chatglm-6b,0.704708699122107,arena-elo,BLZ_240312,[]
479
+ 54,fastchat-t5-3b,0.6975259377494014,arena-elo,BLZ_240312,[]
480
+ 55,stablelm-tuned-alpha-7b,0.6743814844373504,arena-elo,BLZ_240312,[]
481
+ 56,dolly-v2-12b,0.6568236233040702,arena-elo,BLZ_240312,[]
482
+ 57,llama-13b,0.6384676775738228,arena-elo,BLZ_240312,[]
483
+ 542,mixtral-8x7b-instruct-v0.1,0.7641,gpt4all,BLZ_240312,[]
484
+ 543,yi-34b-chat,0.7212999999999999,gpt4all,BLZ_240312,[]
485
+ 550,starling-lm-7b-alpha,0.7272,gpt4all,BLZ_240312,[]
486
+ 554,openhermes-2.5-mistral-7b,0.7312000000000001,gpt4all,BLZ_240312,[]
487
+ 555,openchat-3.5,0.7292000000000001,gpt4all,BLZ_240312,[]
488
+ 558,solar-10.7b-instruct-v1.0,0.7511,gpt4all,BLZ_240312,[]
489
+ 559,dolphin-2.2.1-mistral-7b,0.7223999999999999,gpt4all,BLZ_240312,[]
490
+ 561,zephyr-7b-beta,0.7182999999999999,gpt4all,BLZ_240312,[]
491
+ 565,vicuna-13b,0.631,gpt4all,BLZ_240312,[]
492
+ 567,zephyr-7b-alpha,0.7223999999999999,gpt4all,BLZ_240312,[]
493
+ 573,mistral-7b-instruct-v0.1,0.6795,gpt4all,BLZ_240312,[]
494
+ 575,vicuna-7b,0.61,gpt4all,BLZ_240312,[]
495
+ 576,koala-13b,0.62,gpt4all,BLZ_240312,[]
496
+ 578,gpt4all-13b-snoozy,0.653,gpt4all,BLZ_240312,[]
497
+ 579,mpt-7b-chat,0.648,gpt4all,BLZ_240312,[]
498
+ 583,openassistant-pythia-12b,0.61,gpt4all,BLZ_240312,[]
499
+ 585,fastchat-t5-3b,0.537,gpt4all,BLZ_240312,[]
500
+ 586,stablelm-tuned-alpha-7b,0.513,gpt4all,BLZ_240312,[]
501
+ 588,llama-13b,0.63,gpt4all,BLZ_240312,[]
502
+ 129,mixtral-8x7b-instruct-v0.1,0.7262000000000001,hugging-6,BLZ_240312,[]
503
+ 130,yi-34b-chat,0.6531999999999999,hugging-6,BLZ_240312,[]
504
+ 134,wizardlm-70b-v1.0,0.6125,hugging-6,BLZ_240312,[]
505
+ 135,tulu-2-dpo-70b,0.7376999999999999,hugging-6,BLZ_240312,[]
506
+ 136,vicuna-33b,0.585,hugging-6,BLZ_240312,[]
507
+ 137,starling-lm-7b-alpha,0.6713,hugging-6,BLZ_240312,[]
508
+ 139,llama-2-70b-chat,0.624,hugging-6,BLZ_240312,[]
509
+ 141,openhermes-2.5-mistral-7b,0.6152000000000001,hugging-6,BLZ_240312,[]
510
+ 142,openchat-3.5,0.6124,hugging-6,BLZ_240312,[]
511
+ 145,solar-10.7b-instruct-v1.0,0.742,hugging-6,BLZ_240312,[]
512
+ 146,dolphin-2.2.1-mistral-7b,0.6493000000000001,hugging-6,BLZ_240312,[]
513
+ 147,wizardlm-13b-v1.2,0.5476,hugging-6,BLZ_240312,[]
514
+ 148,zephyr-7b-beta,0.6195,hugging-6,BLZ_240312,[]
515
+ 149,mpt-30b-chat,0.5538000000000001,hugging-6,BLZ_240312,[]
516
+ 150,codellama-34b-instruct,0.5729,hugging-6,BLZ_240312,[]
517
+ 151,llama-2-13b-chat,0.5490999999999999,hugging-6,BLZ_240312,[]
518
+ 152,vicuna-13b,0.5539999999999999,hugging-6,BLZ_240312,[]
519
+ 154,zephyr-7b-alpha,0.595,hugging-6,BLZ_240312,[]
520
+ 156,falcon-180b-chat,0.6785,hugging-6,BLZ_240312,[]
521
+ 158,llama-2-7b-chat,0.5074000000000001,hugging-6,BLZ_240312,[]
522
+ 160,mistral-7b-instruct-v0.1,0.5496,hugging-6,BLZ_240312,[]
523
+ 162,vicuna-7b,0.521,hugging-6,BLZ_240312,[]
524
+ 176,yi-34bx2-moe-60b,0.7672,hugging-6,BLZ_240312,[]
525
+ 947,gpt-4-0314,0.93,llmonitor,BLZ_240312,[]
526
+ 948,gpt-4-0613,0.89,llmonitor,BLZ_240312,[]
527
+ 950,claude-1,0.66,llmonitor,BLZ_240312,[]
528
+ 951,claude-2.0,0.68,llmonitor,BLZ_240312,[]
529
+ 954,gpt-3.5-turbo-0613,0.81,llmonitor,BLZ_240312,[]
530
+ 958,claude-instant-1,0.6,llmonitor,BLZ_240312,[]
531
+ 959,gpt-3.5-turbo-0314,0.79,llmonitor,BLZ_240312,[]
532
+ 965,llama-2-70b-chat,0.6,llmonitor,BLZ_240312,[]
533
+ 975,mpt-30b-chat,0.4,llmonitor,BLZ_240312,[]
534
+ 976,codellama-34b-instruct,0.34,llmonitor,BLZ_240312,[]
535
+ 977,llama-2-13b-chat,0.5,llmonitor,BLZ_240312,[]
536
+ 978,vicuna-13b,0.5,llmonitor,BLZ_240312,[]
537
+ 982,falcon-180b-chat,0.67,llmonitor,BLZ_240312,[]
538
+ 983,guanaco-33b,0.43,llmonitor,BLZ_240312,[]
539
+ 984,llama-2-7b-chat,0.5,llmonitor,BLZ_240312,[]
540
+ 986,mistral-7b-instruct-v0.1,0.57,llmonitor,BLZ_240312,[]
541
+ 987,palm-chat-bison-001,0.57,llmonitor,BLZ_240312,[]
542
+ 988,vicuna-7b,0.41,llmonitor,BLZ_240312,[]
543
+ 989,koala-13b,0.31,llmonitor,BLZ_240312,[]
544
+ 992,mpt-7b-chat,0.43,llmonitor,BLZ_240312,[]
545
+ 1000,dolly-v2-12b,0.23,llmonitor,BLZ_240312,[]
546
+ 59,gpt-4-0125-preview,0.0929,mt-bench,BLZ_240312,[]
547
+ 60,gpt-4-1106-preview,0.0932,mt-bench,BLZ_240312,[]
548
+ 62,gpt-4-0314,0.08960000000000001,mt-bench,BLZ_240312,[]
549
+ 63,gpt-4-0613,0.09179999999999999,mt-bench,BLZ_240312,[]
550
+ 64,mistral-medium,0.0861,mt-bench,BLZ_240312,[]
551
+ 65,claude-1,0.079,mt-bench,BLZ_240312,[]
552
+ 66,claude-2.0,0.0806,mt-bench,BLZ_240312,[]
553
+ 67,gemini-pro-dev-api,0.08039999999999999,mt-bench,BLZ_240312,[]
554
+ 68,claude-2.1,0.0818,mt-bench,BLZ_240312,[]
555
+ 69,gpt-3.5-turbo-0613,0.0839,mt-bench,BLZ_240312,[]
556
+ 70,mixtral-8x7b-instruct-v0.1,0.083,mt-bench,BLZ_240312,[]
557
+ 71,yi-34b-chat,0.07769999999999999,mt-bench,BLZ_240312,[]
558
+ 72,gemini-pro,0.08039999999999999,mt-bench,BLZ_240312,[]
559
+ 73,claude-instant-1,0.0785,mt-bench,BLZ_240312,[]
560
+ 74,gpt-3.5-turbo-0314,0.0794,mt-bench,BLZ_240312,[]
561
+ 75,wizardlm-70b-v1.0,0.0771,mt-bench,BLZ_240312,[]
562
+ 76,tulu-2-dpo-70b,0.0789,mt-bench,BLZ_240312,[]
563
+ 77,vicuna-33b,0.0712,mt-bench,BLZ_240312,[]
564
+ 78,starling-lm-7b-alpha,0.0809,mt-bench,BLZ_240312,[]
565
+ 79,deepseek-llm-67b-chat,0.08529999999999999,mt-bench,BLZ_240312,[]
566
+ 80,llama-2-70b-chat,0.06860000000000001,mt-bench,BLZ_240312,[]
567
+ 81,nv-llama2-70b-steerlm-chat,0.0754,mt-bench,BLZ_240312,[]
568
+ 82,openhermes-2.5-mistral-7b,0.07690000000000001,mt-bench,BLZ_240312,[]
569
+ 83,openchat-3.5,0.0781,mt-bench,BLZ_240312,[]
570
+ 84,pplx-70b-online,0.0588,mt-bench,BLZ_240312,[]
571
+ 85,gpt-3.5-turbo-1106,0.0832,mt-bench,BLZ_240312,[]
572
+ 86,solar-10.7b-instruct-v1.0,0.0758,mt-bench,BLZ_240312,[]
573
+ 88,wizardlm-13b-v1.2,0.07200000000000001,mt-bench,BLZ_240312,[]
574
+ 89,zephyr-7b-beta,0.07339999999999999,mt-bench,BLZ_240312,[]
575
+ 90,mpt-30b-chat,0.0639,mt-bench,BLZ_240312,[]
576
+ 92,llama-2-13b-chat,0.0665,mt-bench,BLZ_240312,[]
577
+ 93,vicuna-13b,0.06570000000000001,mt-bench,BLZ_240312,[]
578
+ 95,zephyr-7b-alpha,0.0688,mt-bench,BLZ_240312,[]
579
+ 96,qwen-14b-chat,0.0696,mt-bench,BLZ_240312,[]
580
+ 98,guanaco-33b,0.0653,mt-bench,BLZ_240312,[]
581
+ 99,llama-2-7b-chat,0.06269999999999999,mt-bench,BLZ_240312,[]
582
+ 101,mistral-7b-instruct-v0.1,0.0684,mt-bench,BLZ_240312,[]
583
+ 102,palm-chat-bison-001,0.064,mt-bench,BLZ_240312,[]
584
+ 103,vicuna-7b,0.0617,mt-bench,BLZ_240312,[]
585
+ 104,koala-13b,0.0535,mt-bench,BLZ_240312,[]
586
+ 106,gpt4all-13b-snoozy,0.0541,mt-bench,BLZ_240312,[]
587
+ 107,mpt-7b-chat,0.0542,mt-bench,BLZ_240312,[]
588
+ 108,chatglm2-6b,0.0496,mt-bench,BLZ_240312,[]
589
+ 109,rwkv-4-raven-14b,0.0398,mt-bench,BLZ_240312,[]
590
+ 110,alpaca-13b,0.0453,mt-bench,BLZ_240312,[]
591
+ 111,openassistant-pythia-12b,0.0432,mt-bench,BLZ_240312,[]
592
+ 112,chatglm-6b,0.045,mt-bench,BLZ_240312,[]
593
+ 113,fastchat-t5-3b,0.0304,mt-bench,BLZ_240312,[]
594
+ 114,stablelm-tuned-alpha-7b,0.0275,mt-bench,BLZ_240312,[]
595
+ 115,dolly-v2-12b,0.032799999999999996,mt-bench,BLZ_240312,[]
596
+ 116,llama-13b,0.026099999999999998,mt-bench,BLZ_240312,[]
597
+ 0,gpt-4-0613,0.957,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
598
+ 1,llama-3-70b,0.902,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
599
+ 2,mixtral-8x22b,0.855,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
600
+ 3,palmyra-x-v3-72b,0.826,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
601
+ 4,gpt-4-turbo-1106-preview,0.821,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
602
+ 5,palm-2-unicorn,0.781,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
603
+ 6,claude-3-opus-20240229,0.762,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
604
+ 7,qwen1.5-72b,0.757,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
605
+ 8,palmyra-x-v2-33b,0.736,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
606
+ 9,yi-34b,0.723,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
607
+ 10,qwen1.5-32b,0.689,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
608
+ 11,claude-v1.3,0.689,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
609
+ 12,mixtral-8x7b-32k-seqlen,0.679,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
610
+ 13,palm-2-bison,0.655,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
611
+ 14,claude-2.0,0.651,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
612
+ 15,deepseek-llm-67b-chat,0.645,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
613
+ 16,llama-2-70b,0.609,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
614
+ 17,claude-2.1,0.594,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
615
+ 18,gpt-3.5-text-davinci-003,0.577,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
616
+ 19,qwen1.5-14b,0.574,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
617
+ 20,claude-instant-1.2,0.551,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
618
+ 21,llama-3-8b,0.519,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
619
+ 22,gpt-3.5-turbo-0613,0.502,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
620
+ 23,gemma-7b,0.47,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
621
+ 24,claude-3-sonnet-20240229,0.468,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
622
+ 25,gpt-3.5-text-davinci-002,0.468,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
623
+ 26,llama-65b,0.466,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
624
+ 27,mistral-large-2402,0.46,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
625
+ 28,cohere-command,0.421,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
626
+ 29,dbrx-instructruct,0.419,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
627
+ 30,mistral-v0.1-7b,0.415,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
628
+ 31,mistral-small-2402,0.415,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
629
+ 32,mistral-medium-2312,0.383,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
630
+ 33,qwen1.5-7b,0.377,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
631
+ 34,claude-3-haiku-20240307,0.377,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
632
+ 35,yi-6b,0.351,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
633
+ 36,llama-2-13b,0.332,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
634
+ 37,jurassic-2-jumbo-178b,0.317,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
635
+ 38,falcon-40b,0.306,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
636
+ 39,phi-2,0.26,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
637
+ 40,jurassic-2-grande-17b,0.253,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
638
+ 41,llama-2-7b,0.234,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
639
+ 42,luminous-supreme-70b,0.213,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
640
+ 43,cohere-command-light,0.166,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
641
+ 44,luminous-extended-30b,0.119,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
642
+ 45,falcon-7b,0.1,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
643
+ 46,olmo-7b,0.083,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
644
+ 47,luminous-base-13b,0.072,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
645
+ 0,llama-2-70b,0.944,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
646
+ 1,llama-65b,0.908,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
647
+ 2,text-davinci-002,0.905,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
648
+ 3,mistral-v0.1-7b,0.884,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
649
+ 4,cohere-command-beta-52.4b,0.874,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
650
+ 5,text-davinci-003,0.872,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
651
+ 6,jurassic-2-jumbo-178b,0.824,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
652
+ 7,llama-2-13b,0.823,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
653
+ 8,tnlg-v2-530b,0.787,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
654
+ 9,gpt-3.5-turbo-0613,0.783,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
655
+ 10,llama-30b,0.781,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
656
+ 11,anthropic-lm-v4-s3-52b,0.78,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
657
+ 12,gpt-3.5-turbo-0301,0.76,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
658
+ 13,jurassic-2-grande-17b,0.743,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
659
+ 14,palmyra-x-43b,0.732,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
660
+ 15,falcon-40b,0.729,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
661
+ 16,falcon-instruct-40b,0.727,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
662
+ 17,mpt-instruct-30b,0.716,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
663
+ 18,mpt-30b,0.714,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
664
+ 19,j1-grande-v2-beta-17b,0.706,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
665
+ 20,vicuna-v1.3-13b,0.706,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
666
+ 21,cohere-command-beta-6.1b,0.675,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
667
+ 22,cohere-xlarge-v20221108-52.4b,0.664,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
668
+ 23,luminous-supreme-70b,0.662,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
669
+ 24,vicuna-v1.3-7b,0.625,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
670
+ 25,opt-175b,0.609,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
671
+ 26,llama-2-7b,0.607,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
672
+ 27,llama-13b,0.595,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
673
+ 28,instructpalmyra-30b,0.568,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
674
+ 29,cohere-xlarge-v20220609-52.4b,0.56,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
675
+ 30,jurassic-2-large-7.5b,0.553,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
676
+ 31,davinci-175b,0.538,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
677
+ 32,llama-7b,0.533,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
678
+ 33,redpajama-incite-instruct-7b,0.524,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
679
+ 34,j1-jumbo-v1-178b,0.517,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
680
+ 35,glm-130b,0.512,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
681
+ 36,luminous-extended-30b,0.485,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
682
+ 37,opt-66b,0.448,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
683
+ 38,bloom-176b,0.446,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
684
+ 39,j1-grande-v1-17b,0.433,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
685
+ 40,alpaca-7b,0.381,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
686
+ 41,falcon-7b,0.378,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
687
+ 42,redpajama-incite-base-7b,0.378,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
688
+ 43,cohere-large-v20220720-13.1b,0.372,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
689
+ 44,redpajama-incite-instruct-v1-3b,0.366,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
690
+ 45,text-curie-001,0.36,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
691
+ 46,gpt-neox-20b,0.351,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
692
+ 47,luminous-base-13b,0.315,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
693
+ 48,cohere-medium-v20221108-6.1b,0.312,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
694
+ 49,redpajama-incite-base-v1-3b,0.311,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
695
+ 50,tnlg-v2-6.7b,0.309,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
696
+ 51,j1-large-v1-7.5b,0.285,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
697
+ 52,gpt-j-6b,0.273,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
698
+ 53,pythia-12b,0.257,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
699
+ 54,curie-6.7b,0.247,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
700
+ 55,falcon-instruct-7b,0.244,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
701
+ 56,cohere-medium-v20220720-6.1b,0.23,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
702
+ 57,text-babbage-001,0.229,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
703
+ 58,t0pp-11b,0.197,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
704
+ 59,pythia-6.9b,0.196,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
705
+ 60,ul2-20b,0.167,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
706
+ 61,t5-11b,0.131,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
707
+ 62,babbage-1.3b,0.114,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
708
+ 63,cohere-small-v20220720-410m,0.109,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
709
+ 64,ada-350m,0.108,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
710
+ 65,text-ada-001,0.107,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
711
+ 66,yalm-100b,0.075,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
712
+ 67,llama-2-70b,0.582,mmlu,helm_classic_240130,[]
713
+ 68,llama-65b,0.584,mmlu,helm_classic_240130,[]
714
+ 69,text-davinci-002,0.568,mmlu,helm_classic_240130,[]
715
+ 70,mistral-v0.1-7b,0.572,mmlu,helm_classic_240130,[]
716
+ 71,cohere-command-beta-52.4b,0.452,mmlu,helm_classic_240130,[]
717
+ 72,text-davinci-003,0.569,mmlu,helm_classic_240130,[]
718
+ 73,jurassic-2-jumbo-178b,0.48,mmlu,helm_classic_240130,[]
719
+ 74,llama-2-13b,0.507,mmlu,helm_classic_240130,[]
720
+ 75,tnlg-v2-530b,0.469,mmlu,helm_classic_240130,[]
721
+ 76,gpt-3.5-turbo-0613,0.391,mmlu,helm_classic_240130,[]
722
+ 77,llama-30b,0.531,mmlu,helm_classic_240130,[]
723
+ 78,anthropic-lm-v4-s3-52b,0.481,mmlu,helm_classic_240130,[]
724
+ 79,gpt-3.5-turbo-0301,0.59,mmlu,helm_classic_240130,[]
725
+ 80,jurassic-2-grande-17b,0.475,mmlu,helm_classic_240130,[]
726
+ 81,palmyra-x-43b,0.609,mmlu,helm_classic_240130,[]
727
+ 82,falcon-40b,0.509,mmlu,helm_classic_240130,[]
728
+ 83,falcon-instruct-40b,0.497,mmlu,helm_classic_240130,[]
729
+ 84,mpt-instruct-30b,0.444,mmlu,helm_classic_240130,[]
730
+ 85,mpt-30b,0.437,mmlu,helm_classic_240130,[]
731
+ 86,j1-grande-v2-beta-17b,0.445,mmlu,helm_classic_240130,[]
732
+ 87,vicuna-v1.3-13b,0.462,mmlu,helm_classic_240130,[]
733
+ 88,cohere-command-beta-6.1b,0.406,mmlu,helm_classic_240130,[]
734
+ 89,cohere-xlarge-v20221108-52.4b,0.382,mmlu,helm_classic_240130,[]
735
+ 90,luminous-supreme-70b,0.38,mmlu,helm_classic_240130,[]
736
+ 91,vicuna-v1.3-7b,0.434,mmlu,helm_classic_240130,[]
737
+ 92,opt-175b,0.318,mmlu,helm_classic_240130,[]
738
+ 93,llama-2-7b,0.431,mmlu,helm_classic_240130,[]
739
+ 94,llama-13b,0.422,mmlu,helm_classic_240130,[]
740
+ 95,instructpalmyra-30b,0.403,mmlu,helm_classic_240130,[]
741
+ 96,cohere-xlarge-v20220609-52.4b,0.353,mmlu,helm_classic_240130,[]
742
+ 97,jurassic-2-large-7.5b,0.339,mmlu,helm_classic_240130,[]
743
+ 98,davinci-175b,0.422,mmlu,helm_classic_240130,[]
744
+ 99,llama-7b,0.321,mmlu,helm_classic_240130,[]
745
+ 100,redpajama-incite-instruct-7b,0.363,mmlu,helm_classic_240130,[]
746
+ 101,j1-jumbo-v1-178b,0.259,mmlu,helm_classic_240130,[]
747
+ 102,glm-130b,0.344,mmlu,helm_classic_240130,[]
748
+ 103,luminous-extended-30b,0.321,mmlu,helm_classic_240130,[]
749
+ 104,opt-66b,0.276,mmlu,helm_classic_240130,[]
750
+ 105,bloom-176b,0.299,mmlu,helm_classic_240130,[]
751
+ 106,j1-grande-v1-17b,0.27,mmlu,helm_classic_240130,[]
752
+ 107,alpaca-7b,0.385,mmlu,helm_classic_240130,[]
753
+ 108,falcon-7b,0.286,mmlu,helm_classic_240130,[]
754
+ 109,redpajama-incite-base-7b,0.302,mmlu,helm_classic_240130,[]
755
+ 110,cohere-large-v20220720-13.1b,0.324,mmlu,helm_classic_240130,[]
756
+ 111,redpajama-incite-instruct-v1-3b,0.257,mmlu,helm_classic_240130,[]
757
+ 112,text-curie-001,0.237,mmlu,helm_classic_240130,[]
758
+ 113,gpt-neox-20b,0.276,mmlu,helm_classic_240130,[]
759
+ 114,luminous-base-13b,0.27,mmlu,helm_classic_240130,[]
760
+ 115,cohere-medium-v20221108-6.1b,0.254,mmlu,helm_classic_240130,[]
761
+ 116,redpajama-incite-base-v1-3b,0.263,mmlu,helm_classic_240130,[]
762
+ 117,tnlg-v2-6.7b,0.242,mmlu,helm_classic_240130,[]
763
+ 118,j1-large-v1-7.5b,0.241,mmlu,helm_classic_240130,[]
764
+ 119,gpt-j-6b,0.249,mmlu,helm_classic_240130,[]
765
+ 120,pythia-12b,0.274,mmlu,helm_classic_240130,[]
766
+ 121,curie-6.7b,0.243,mmlu,helm_classic_240130,[]
767
+ 122,falcon-instruct-7b,0.275,mmlu,helm_classic_240130,[]
768
+ 123,cohere-medium-v20220720-6.1b,0.279,mmlu,helm_classic_240130,[]
769
+ 124,text-babbage-001,0.229,mmlu,helm_classic_240130,[]
770
+ 125,t0pp-11b,0.407,mmlu,helm_classic_240130,[]
771
+ 126,pythia-6.9b,0.236,mmlu,helm_classic_240130,[]
772
+ 127,ul2-20b,0.291,mmlu,helm_classic_240130,[]
773
+ 128,t5-11b,0.29,mmlu,helm_classic_240130,[]
774
+ 129,babbage-1.3b,0.235,mmlu,helm_classic_240130,[]
775
+ 130,cohere-small-v20220720-410m,0.264,mmlu,helm_classic_240130,[]
776
+ 131,ada-350m,0.243,mmlu,helm_classic_240130,[]
777
+ 132,text-ada-001,0.238,mmlu,helm_classic_240130,[]
778
+ 133,yalm-100b,0.243,mmlu,helm_classic_240130,[]
779
+ 0,gpt-4o-0513,35.7,wildbench-mix,wildbench_240612,[]
780
+ 1,gpt-4-turbo-0409,34.6,wildbench-mix,wildbench_240612,[]
781
+ 2,gpt-4-turbo-0125,29.9,wildbench-mix,wildbench_240612,[]
782
+ 3,gemini-1.5-pro,27.8,wildbench-mix,wildbench_240612,[]
783
+ 4,llama-3-70b-inst,21.0,wildbench-mix,wildbench_240612,[]
784
+ 5,claude-3-opus,20.1,wildbench-mix,wildbench_240612,[]
785
+ 6,gemini-1.5-flash,17.4,wildbench-mix,wildbench_240612,[]
786
+ 7,yi-1.5-34b-chat,16.8,wildbench-mix,wildbench_240612,[]
787
+ 8,llama3-inst-8b-simpo,14.0,wildbench-mix,wildbench_240612,[]
788
+ 9,claude-3-sonnet,7.2,wildbench-mix,wildbench_240612,[]
789
+ 10,qwen1.5-72b-chat,4.4,wildbench-mix,wildbench_240612,[]
790
+ 11,command-r-plus,0.4,wildbench-mix,wildbench_240612,[]
791
+ 12,claude-3-haiku,-8.5,wildbench-mix,wildbench_240612,[]
792
+ 13,mistral-large,-10.5,wildbench-mix,wildbench_240612,[]
793
+ 14,starlinglm-7b-beta,-11.9,wildbench-mix,wildbench_240612,[]
794
+ 15,llama-3-8b-inst,-14.6,wildbench-mix,wildbench_240612,[]
795
+ 16,command-r,-16.0,wildbench-mix,wildbench_240612,[]
796
+ 17,mixtral-8x7b-inst,-18.8,wildbench-mix,wildbench_240612,[]
797
+ 18,dbrx-instruct,-21.6,wildbench-mix,wildbench_240612,[]
798
+ 19,yi-1.5-6b-chat,-24.3,wildbench-mix,wildbench_240612,[]
799
+ 20,mistral-7b-inst-v0.2,-25.0,wildbench-mix,wildbench_240612,[]
800
+ 21,tulu-2-dpo-70b,-25.4,wildbench-mix,wildbench_240612,[]
801
+ 22,llama-2-70b-chat,-26.8,wildbench-mix,wildbench_240612,[]
802
+ 23,qwen1.5-7b-chat,-27.0,wildbench-mix,wildbench_240612,[]
803
+ 24,phi-3-medium-128k,-33.3,wildbench-mix,wildbench_240612,[]
804
+ 25,gpt-3.5-turbo-0125,-33.5,wildbench-mix,wildbench_240612,[]
805
+ 26,llama-2-7b-chat,-48.0,wildbench-mix,wildbench_240612,[]
806
+ 27,gemma-7b-it,-57.0,wildbench-mix,wildbench_240612,[]
807
+ 28,gemma-2b-it,-74.1,wildbench-mix,wildbench_240612,[]
808
+ 13,flan-t5-xxl,0.2244897959183673,mmlu_pro,bluebench_v02,[]
809
+ 30,granite-13b-chat-v2,0.2857142857142857,mmlu_pro,bluebench_v02,[]
810
+ 41,granite-13b-instruct-v2,0.0408163265306122,mmlu_pro,bluebench_v02,[]
811
+ 50,granite-7b-lab,0.2423469387755102,mmlu_pro,bluebench_v02,[]
812
+ 60,llama-2-13b-chat,0.0943877551020408,mmlu_pro,bluebench_v02,[]
813
+ 70,llama-2-70b,0.4081632653061224,mmlu_pro,bluebench_v02,[]
814
+ 81,llama-3-70b-instruct,0.4285714285714285,mmlu_pro,bluebench_v02,[]
815
+ 92,llama-3-8b,0.375,mmlu_pro,bluebench_v02,[]
816
+ 103,llama-3-8b-instruct,0.0994897959183673,mmlu_pro,bluebench_v02,[]
817
+ 112,llama-30b,0.3061224489795918,mmlu_pro,bluebench_v02,[]
818
+ 121,llama-7b,0.1326530612244897,mmlu_pro,bluebench_v02,[]
819
+ 132,mistral-v0.1-7b,0.2857142857142857,mmlu_pro,bluebench_v02,[]
820
+ 143,mixtral-8x7b-instruct-v01,0.375,mmlu_pro,bluebench_v02,[]
821
+ 153,vicuna-13b-v1.5-16k,0.2857142857142857,mmlu_pro,bluebench_v02,[]
822
+ 162,vicuna-33b-v1.3,0.2653061224489796,mmlu_pro,bluebench_v02,[]
823
+ 172,vicuna-v1.3-7b,0.1938775510204081,mmlu_pro,bluebench_v02,[]
824
+ 182,vicuna-7b-v1.5,0.2857142857142857,mmlu_pro,bluebench_v02,[]
825
+ 192,zephyr-7b-beta,0.2959183673469387,mmlu_pro,bluebench_v02,[]
assets/combined_holistic_20240708.csv ADDED
@@ -0,0 +1,938 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model,score,scenario,source,aggragated_from
2
+ gpt_4_turbo_2024_04_09,82.6,arena_hard,arena_hard_2404,[]
3
+ gpt_4_0125_preview,78.0,arena_hard,arena_hard_2404,[]
4
+ gemini_1.5_pro_api_preview,72.0,arena_hard,arena_hard_2404,[]
5
+ yi_large,63.7,arena_hard,arena_hard_2404,[]
6
+ claude_3_opus_20240229,60.4,arena_hard,arena_hard_2404,[]
7
+ glm_4,55.7,arena_hard,arena_hard_2404,[]
8
+ gpt_4_0314,50.0,arena_hard,arena_hard_2404,[]
9
+ gemini_1.5_flash_api_preview,49.6,arena_hard,arena_hard_2404,[]
10
+ claude_3_sonnet_20240229,46.8,arena_hard,arena_hard_2404,[]
11
+ claude_3_haiku_20240307,41.5,arena_hard,arena_hard_2404,[]
12
+ llama_3_70b_chat,41.1,arena_hard,arena_hard_2404,[]
13
+ gpt_4_0613,37.9,arena_hard,arena_hard_2404,[]
14
+ mistral_large_2402,37.7,arena_hard,arena_hard_2404,[]
15
+ mixtral_8x22b_instruct_v0.1,36.4,arena_hard,arena_hard_2404,[]
16
+ qwen1.5_72b_chat,36.1,arena_hard,arena_hard_2404,[]
17
+ command_r_plus,33.1,arena_hard,arena_hard_2404,[]
18
+ mistral_medium,31.9,arena_hard,arena_hard_2404,[]
19
+ mistral_next,27.4,arena_hard,arena_hard_2404,[]
20
+ gpt_3.5_turbo_0613,24.8,arena_hard,arena_hard_2404,[]
21
+ claude_2.0,24.0,arena_hard,arena_hard_2404,[]
22
+ dbrx_instructruct,23.9,arena_hard,arena_hard_2404,[]
23
+ mixtral_8x7b_instruct_v0.1,23.4,arena_hard,arena_hard_2404,[]
24
+ gpt_3.5_turbo_0125,23.3,arena_hard,arena_hard_2404,[]
25
+ yi_34b_chat,23.1,arena_hard,arena_hard_2404,[]
26
+ starling_lm_7b_beta,23.0,arena_hard,arena_hard_2404,[]
27
+ claude_2.1,22.8,arena_hard,arena_hard_2404,[]
28
+ snorkel_mistral_pairrm_dpo,20.7,arena_hard,arena_hard_2404,[]
29
+ llama_3_8b_chat,20.6,arena_hard,arena_hard_2404,[]
30
+ gpt_3.5_turbo_1106,18.9,arena_hard,arena_hard_2404,[]
31
+ gpt_3.5_turbo_0301,18.1,arena_hard,arena_hard_2404,[]
32
+ gemini_1.0_pro,17.8,arena_hard,arena_hard_2404,[]
33
+ snowflake_arctic_instruct,17.6,arena_hard,arena_hard_2404,[]
34
+ command_r,17.0,arena_hard,arena_hard_2404,[]
35
+ phi_3_mini_128k_instruct,15.4,arena_hard,arena_hard_2404,[]
36
+ tulu_2_dpo_70b,15.0,arena_hard,arena_hard_2404,[]
37
+ starling_lm_7b_alpha,12.8,arena_hard,arena_hard_2404,[]
38
+ mistral_7b_instruct,12.6,arena_hard,arena_hard_2404,[]
39
+ gemma_1.1_7b_it,12.1,arena_hard,arena_hard_2404,[]
40
+ llama_2_70b_chat,11.6,arena_hard,arena_hard_2404,[]
41
+ vicuna_33b_v1.3,8.6,arena_hard,arena_hard_2404,[]
42
+ gemma_7b_it,7.5,arena_hard,arena_hard_2404,[]
43
+ llama_2_7b_chat,4.6,arena_hard,arena_hard_2404,[]
44
+ gemma_1.1_2b_it,3.4,arena_hard,arena_hard_2404,[]
45
+ gemma_2b_it,3.0,arena_hard,arena_hard_2404,[]
46
+ gpt_4o_2024_05_13,87.9,mixeval,mixeval_240601,[]
47
+ claude_3_opus,88.1,mixeval,mixeval_240601,[]
48
+ gpt_4_turbo_2024_04_09,88.8,mixeval,mixeval_240601,[]
49
+ gemini_1.5_pro_api_0409,84.2,mixeval,mixeval_240601,[]
50
+ yi_large_preview,84.4,mixeval,mixeval_240601,[]
51
+ llama_3_70b_instruct,84.0,mixeval,mixeval_240601,[]
52
+ qwen_max_0428,86.1,mixeval,mixeval_240601,[]
53
+ claude_3_sonnet,81.7,mixeval,mixeval_240601,[]
54
+ reka_core_20240415,83.3,mixeval,mixeval_240601,[]
55
+ mammoth2_8x7b_plus,81.5,mixeval,mixeval_240601,[]
56
+ deepseek_v2,83.7,mixeval,mixeval_240601,[]
57
+ command_r_plus,81.5,mixeval,mixeval_240601,[]
58
+ yi_1.5_34b_chat,81.7,mixeval,mixeval_240601,[]
59
+ mistral_large,84.2,mixeval,mixeval_240601,[]
60
+ qwen1.5_72b_chat,84.1,mixeval,mixeval_240601,[]
61
+ mistral_medium,81.9,mixeval,mixeval_240601,[]
62
+ gemini_1.0_pro,78.9,mixeval,mixeval_240601,[]
63
+ reka_flash_20240226,79.8,mixeval,mixeval_240601,[]
64
+ mistral_small,81.2,mixeval,mixeval_240601,[]
65
+ llama_3_8b_instruct,75.0,mixeval,mixeval_240601,[]
66
+ command_r,77.0,mixeval,mixeval_240601,[]
67
+ qwen1.5_32b_chat,81.0,mixeval,mixeval_240601,[]
68
+ gpt_3.5_turbo_0125,79.7,mixeval,mixeval_240601,[]
69
+ claude_3_haiku,79.7,mixeval,mixeval_240601,[]
70
+ yi_34b_chat,80.1,mixeval,mixeval_240601,[]
71
+ mixtral_8x7b_instruct_v0.1,76.4,mixeval,mixeval_240601,[]
72
+ starling_lm_7b_beta,74.8,mixeval,mixeval_240601,[]
73
+ yi_1.5_9b_chat,74.2,mixeval,mixeval_240601,[]
74
+ gemma_1.1_7b_it,69.6,mixeval,mixeval_240601,[]
75
+ vicuna_33b_v1.3,66.3,mixeval,mixeval_240601,[]
76
+ llama_2_70b_chat,74.6,mixeval,mixeval_240601,[]
77
+ map_neo_instruct_v0.1,70.0,mixeval,mixeval_240601,[]
78
+ mistral_7b_instruct_v0.2,70.0,mixeval,mixeval_240601,[]
79
+ qwen1.5_7b_chat,71.4,mixeval,mixeval_240601,[]
80
+ reka_edge_20240208,68.5,mixeval,mixeval_240601,[]
81
+ zephyr_7b_beta,69.1,mixeval,mixeval_240601,[]
82
+ llama_2_7b_chat,61.7,mixeval,mixeval_240601,[]
83
+ yi_6b_chat,65.6,mixeval,mixeval_240601,[]
84
+ qwen1.5_moe_a2.7b_chat,69.1,mixeval,mixeval_240601,[]
85
+ gemma_1.1_2b_it,51.9,mixeval,mixeval_240601,[]
86
+ vicuna_7b_v1.5,60.3,mixeval,mixeval_240601,[]
87
+ olmo_7b_instruct,55.0,mixeval,mixeval_240601,[]
88
+ qwen1.5_4b_chat,57.2,mixeval,mixeval_240601,[]
89
+ jetmoe_8b_chat,51.6,mixeval,mixeval_240601,[]
90
+ mpt_7b_chat,43.8,mixeval,mixeval_240601,[]
91
+ llama_3_70b,82.2,mixeval,mixeval_240601,[]
92
+ qwen1.5_72b,79.5,mixeval,mixeval_240601,[]
93
+ yi_34b,78.3,mixeval,mixeval_240601,[]
94
+ qwen1.5_32b,77.6,mixeval,mixeval_240601,[]
95
+ mixtral_8x7b,74.0,mixeval,mixeval_240601,[]
96
+ llama_2_70b,73.2,mixeval,mixeval_240601,[]
97
+ qwen1.5_moe_a2.7b,70.2,mixeval,mixeval_240601,[]
98
+ qwen1.5_7b,68.2,mixeval,mixeval_240601,[]
99
+ llama_3_8b,65.1,mixeval,mixeval_240601,[]
100
+ mistral_7b,64.8,mixeval,mixeval_240601,[]
101
+ gemma_7b,64.7,mixeval,mixeval_240601,[]
102
+ yi_6b,63.1,mixeval,mixeval_240601,[]
103
+ qwen1.5_4b,58.2,mixeval,mixeval_240601,[]
104
+ jetmoe_8b,57.1,mixeval,mixeval_240601,[]
105
+ deepseek_7b,52.2,mixeval,mixeval_240601,[]
106
+ phi_2,51.9,mixeval,mixeval_240601,[]
107
+ deepseekmoe_16b,51.4,mixeval,mixeval_240601,[]
108
+ llama_2_7b,43.1,mixeval,mixeval_240601,[]
109
+ gemma_2b,38.9,mixeval,mixeval_240601,[]
110
+ olmo_7b,31.8,mixeval,mixeval_240601,[]
111
+ mpt_7b,30.8,mixeval,mixeval_240601,[]
112
+ gpt_4_0314,0.57,agieval,BLZ_240312,[]
113
+ gpt_4_0613,0.57,agieval,BLZ_240312,[]
114
+ claude_1,0.49700000000000005,agieval,BLZ_240312,[]
115
+ mixtral_8x7b_instruct_v0.1,0.45299999999999996,agieval,BLZ_240312,[]
116
+ yi_34b_chat,0.508,agieval,BLZ_240312,[]
117
+ gpt_3.5_turbo_0314,0.43200000000000005,agieval,BLZ_240312,[]
118
+ vicuna_33b,0.373,agieval,BLZ_240312,[]
119
+ starling_lm_7b_alpha,0.401,agieval,BLZ_240312,[]
120
+ llama_2_70b_chat,0.45,agieval,BLZ_240312,[]
121
+ openhermes_2.5_mistral_7b,0.43,agieval,BLZ_240312,[]
122
+ openchat_3.5,0.42700000000000005,agieval,BLZ_240312,[]
123
+ solar_10.7b_instruct_v1.0,0.47600000000000003,agieval,BLZ_240312,[]
124
+ dolphin_2.2.1_mistral_7b,0.392,agieval,BLZ_240312,[]
125
+ zephyr_7b_beta,0.406,agieval,BLZ_240312,[]
126
+ llama_2_13b_chat,0.336,agieval,BLZ_240312,[]
127
+ vicuna_13b,0.368,agieval,BLZ_240312,[]
128
+ zephyr_7b_alpha,0.38,agieval,BLZ_240312,[]
129
+ qwen_14b_chat,0.396,agieval,BLZ_240312,[]
130
+ llama_2_7b_chat,0.29600000000000004,agieval,BLZ_240312,[]
131
+ mistral_7b_instruct_v0.1,0.335,agieval,BLZ_240312,[]
132
+ vicuna_7b,0.314,agieval,BLZ_240312,[]
133
+ chatglm3_6b,0.414,agieval,BLZ_240312,[]
134
+ chatglm_6b,0.325,agieval,BLZ_240312,[]
135
+ llama_13b,0.205,agieval,BLZ_240312,[]
136
+ gpt_4_0314,0.963,arc_c,BLZ_240312,[]
137
+ mistral_medium,0.899,arc_c,BLZ_240312,[]
138
+ mixtral_8x7b_instruct_v0.1,0.7021999999999999,arc_c,BLZ_240312,[]
139
+ yi_34b_chat,0.6544,arc_c,BLZ_240312,[]
140
+ gpt_3.5_turbo_0314,0.855,arc_c,BLZ_240312,[]
141
+ wizardlm_70b_v1.0,0.6544,arc_c,BLZ_240312,[]
142
+ tulu_2_dpo_70b,0.721,arc_c,BLZ_240312,[]
143
+ vicuna_33b,0.6212,arc_c,BLZ_240312,[]
144
+ starling_lm_7b_alpha,0.6382,arc_c,BLZ_240312,[]
145
+ llama_2_70b_chat,0.6459,arc_c,BLZ_240312,[]
146
+ openhermes_2.5_mistral_7b,0.6493000000000001,arc_c,BLZ_240312,[]
147
+ openchat_3.5,0.6391,arc_c,BLZ_240312,[]
148
+ solar_10.7b_instruct_v1.0,0.7108,arc_c,BLZ_240312,[]
149
+ dolphin_2.2.1_mistral_7b,0.6331,arc_c,BLZ_240312,[]
150
+ wizardlm_13b_v1.2,0.5904,arc_c,BLZ_240312,[]
151
+ zephyr_7b_beta,0.6203,arc_c,BLZ_240312,[]
152
+ mpt_30b_chat,0.5870000000000001,arc_c,BLZ_240312,[]
153
+ codellama_34b_instruct,0.5427000000000001,arc_c,BLZ_240312,[]
154
+ llama_2_13b_chat,0.5904,arc_c,BLZ_240312,[]
155
+ vicuna_13b,0.5708,arc_c,BLZ_240312,[]
156
+ zephyr_7b_alpha,0.6101,arc_c,BLZ_240312,[]
157
+ falcon_180b_chat,0.6945,arc_c,BLZ_240312,[]
158
+ llama_2_7b_chat,0.529,arc_c,BLZ_240312,[]
159
+ mistral_7b_instruct_v0.1,0.5452,arc_c,BLZ_240312,[]
160
+ vicuna_7b,0.5324,arc_c,BLZ_240312,[]
161
+ yi_34bx2_moe_60b,0.7108,arc_c,BLZ_240312,[]
162
+ gpt_4_1106_preview,0.977,alpacav1,BLZ_240312,[]
163
+ gpt_4_0314,0.9528,alpacav1,BLZ_240312,[]
164
+ gpt_4_0613,0.9528,alpacav1,BLZ_240312,[]
165
+ mistral_medium,0.9682999999999999,alpacav1,BLZ_240312,[]
166
+ claude_1,0.8839,alpacav1,BLZ_240312,[]
167
+ claude_2.0,0.9136,alpacav1,BLZ_240312,[]
168
+ gemini_pro_dev_api,0.7966,alpacav1,BLZ_240312,[]
169
+ claude_2.1,0.8708,alpacav1,BLZ_240312,[]
170
+ gpt_3.5_turbo_0613,0.8937,alpacav1,BLZ_240312,[]
171
+ mixtral_8x7b_instruct_v0.1,0.9478,alpacav1,BLZ_240312,[]
172
+ yi_34b_chat,0.9408,alpacav1,BLZ_240312,[]
173
+ gemini_pro,0.7966,alpacav1,BLZ_240312,[]
174
+ gpt_3.5_turbo_0314,0.8937,alpacav1,BLZ_240312,[]
175
+ tulu_2_dpo_70b,0.9503,alpacav1,BLZ_240312,[]
176
+ vicuna_33b,0.8898999999999999,alpacav1,BLZ_240312,[]
177
+ starling_lm_7b_alpha,0.9198999999999999,alpacav1,BLZ_240312,[]
178
+ llama_2_70b_chat,0.9266,alpacav1,BLZ_240312,[]
179
+ openchat_3.5,0.8851,alpacav1,BLZ_240312,[]
180
+ gpt_3.5_turbo_1106,0.8626,alpacav1,BLZ_240312,[]
181
+ wizardlm_13b_v1.2,0.8917,alpacav1,BLZ_240312,[]
182
+ zephyr_7b_beta,0.9059999999999999,alpacav1,BLZ_240312,[]
183
+ llama_2_13b_chat,0.8109000000000001,alpacav1,BLZ_240312,[]
184
+ zephyr_7b_alpha,0.8576,alpacav1,BLZ_240312,[]
185
+ guanaco_33b,0.6596,alpacav1,BLZ_240312,[]
186
+ llama_2_7b_chat,0.7137,alpacav1,BLZ_240312,[]
187
+ chatglm2_6b,0.47130000000000005,alpacav1,BLZ_240312,[]
188
+ openassistant_pythia_12b,0.2596,alpacav1,BLZ_240312,[]
189
+ gpt_4_1106_preview,0.5,alpacav2,BLZ_240312,[]
190
+ gpt_4_0314,0.221,alpacav2,BLZ_240312,[]
191
+ gpt_4_0613,0.158,alpacav2,BLZ_240312,[]
192
+ mistral_medium,0.21899999999999997,alpacav2,BLZ_240312,[]
193
+ claude_1,0.17,alpacav2,BLZ_240312,[]
194
+ claude_2.0,0.172,alpacav2,BLZ_240312,[]
195
+ gemini_pro_dev_api,0.16899999999999998,alpacav2,BLZ_240312,[]
196
+ claude_2.1,0.157,alpacav2,BLZ_240312,[]
197
+ gpt_3.5_turbo_0613,0.141,alpacav2,BLZ_240312,[]
198
+ mixtral_8x7b_instruct_v0.1,0.183,alpacav2,BLZ_240312,[]
199
+ yi_34b_chat,0.297,alpacav2,BLZ_240312,[]
200
+ gemini_pro,0.16899999999999998,alpacav2,BLZ_240312,[]
201
+ claude_instant_1,0.161,alpacav2,BLZ_240312,[]
202
+ gpt_3.5_turbo_0314,0.096,alpacav2,BLZ_240312,[]
203
+ wizardlm_70b_v1.0,0.14400000000000002,alpacav2,BLZ_240312,[]
204
+ tulu_2_dpo_70b,0.16,alpacav2,BLZ_240312,[]
205
+ vicuna_33b,0.127,alpacav2,BLZ_240312,[]
206
+ starling_lm_7b_alpha,0.142,alpacav2,BLZ_240312,[]
207
+ deepseek_llm_67b_chat,0.121,alpacav2,BLZ_240312,[]
208
+ llama_2_70b_chat,0.139,alpacav2,BLZ_240312,[]
209
+ openhermes_2.5_mistral_7b,0.10300000000000001,alpacav2,BLZ_240312,[]
210
+ gpt_3.5_turbo_1106,0.092,alpacav2,BLZ_240312,[]
211
+ dolphin_2.2.1_mistral_7b,0.09,alpacav2,BLZ_240312,[]
212
+ wizardlm_13b_v1.2,0.12,alpacav2,BLZ_240312,[]
213
+ zephyr_7b_beta,0.11,alpacav2,BLZ_240312,[]
214
+ llama_2_13b_chat,0.077,alpacav2,BLZ_240312,[]
215
+ vicuna_13b,0.067,alpacav2,BLZ_240312,[]
216
+ zephyr_7b_alpha,0.084,alpacav2,BLZ_240312,[]
217
+ qwen_14b_chat,0.075,alpacav2,BLZ_240312,[]
218
+ guanaco_33b,0.05,alpacav2,BLZ_240312,[]
219
+ llama_2_7b_chat,0.0496,alpacav2,BLZ_240312,[]
220
+ vicuna_7b,0.048,alpacav2,BLZ_240312,[]
221
+ chatglm2_6b,0.027999999999999997,alpacav2,BLZ_240312,[]
222
+ openassistant_pythia_12b,0.018000000000000002,alpacav2,BLZ_240312,[]
223
+ gpt_4_1106_preview,0.32799999999999996,alpacaeval2_lc,BLZ_240312,[]
224
+ gpt_4_0314,0.21600000000000003,alpacaeval2_lc,BLZ_240312,[]
225
+ gpt_4_0613,0.18600000000000003,alpacaeval2_lc,BLZ_240312,[]
226
+ mistral_medium,0.196,alpacaeval2_lc,BLZ_240312,[]
227
+ claude_1,0.21100000000000002,alpacaeval2_lc,BLZ_240312,[]
228
+ claude_2.0,0.21600000000000003,alpacaeval2_lc,BLZ_240312,[]
229
+ gemini_pro_dev_api,0.172,alpacaeval2_lc,BLZ_240312,[]
230
+ claude_2.1,0.193,alpacaeval2_lc,BLZ_240312,[]
231
+ gpt_3.5_turbo_0613,0.14300000000000002,alpacaeval2_lc,BLZ_240312,[]
232
+ mixtral_8x7b_instruct_v0.1,0.168,alpacaeval2_lc,BLZ_240312,[]
233
+ yi_34b_chat,0.188,alpacaeval2_lc,BLZ_240312,[]
234
+ claude_instant_1,0.195,alpacaeval2_lc,BLZ_240312,[]
235
+ gpt_3.5_turbo_0314,0.156,alpacaeval2_lc,BLZ_240312,[]
236
+ wizardlm_70b_v1.0,0.125,alpacaeval2_lc,BLZ_240312,[]
237
+ tulu_2_dpo_70b,0.151,alpacaeval2_lc,BLZ_240312,[]
238
+ vicuna_33b,0.115,alpacaeval2_lc,BLZ_240312,[]
239
+ starling_lm_7b_alpha,0.10099999999999999,alpacaeval2_lc,BLZ_240312,[]
240
+ deepseek_llm_67b_chat,0.141,alpacaeval2_lc,BLZ_240312,[]
241
+ llama_2_70b_chat,0.10400000000000001,alpacaeval2_lc,BLZ_240312,[]
242
+ openhermes_2.5_mistral_7b,0.126,alpacaeval2_lc,BLZ_240312,[]
243
+ gpt_3.5_turbo_1106,0.155,alpacaeval2_lc,BLZ_240312,[]
244
+ dolphin_2.2.1_mistral_7b,0.10800000000000001,alpacaeval2_lc,BLZ_240312,[]
245
+ wizardlm_13b_v1.2,0.099,alpacaeval2_lc,BLZ_240312,[]
246
+ zephyr_7b_beta,0.102,alpacaeval2_lc,BLZ_240312,[]
247
+ llama_2_13b_chat,0.068,alpacaeval2_lc,BLZ_240312,[]
248
+ vicuna_13b,0.085,alpacaeval2_lc,BLZ_240312,[]
249
+ zephyr_7b_alpha,0.086,alpacaeval2_lc,BLZ_240312,[]
250
+ qwen_14b_chat,0.1,alpacaeval2_lc,BLZ_240312,[]
251
+ llama_2_7b_chat,0.045,alpacaeval2_lc,BLZ_240312,[]
252
+ vicuna_7b,0.06,alpacaeval2_lc,BLZ_240312,[]
253
+ gpt_4_0125_preview,1.0,arena_elo,BLZ_240312,[]
254
+ gpt_4_1106_preview,0.9992019154030327,arena_elo,BLZ_240312,[]
255
+ bard_gemini_pro,0.9768555466879489,arena_elo,BLZ_240312,[]
256
+ gpt_4_0314,0.9497206703910615,arena_elo,BLZ_240312,[]
257
+ gpt_4_0613,0.9273743016759777,arena_elo,BLZ_240312,[]
258
+ mistral_medium,0.9177972865123704,arena_elo,BLZ_240312,[]
259
+ claude_1,0.9169992019154031,arena_elo,BLZ_240312,[]
260
+ claude_2.0,0.9034317637669593,arena_elo,BLZ_240312,[]
261
+ gemini_pro_dev_api,0.8938547486033519,arena_elo,BLZ_240312,[]
262
+ claude_2.1,0.8930566640063847,arena_elo,BLZ_240312,[]
263
+ gpt_3.5_turbo_0613,0.8922585794094174,arena_elo,BLZ_240312,[]
264
+ mixtral_8x7b_instruct_v0.1,0.8922585794094174,arena_elo,BLZ_240312,[]
265
+ yi_34b_chat,0.8898643256185156,arena_elo,BLZ_240312,[]
266
+ gemini_pro,0.8890662410215483,arena_elo,BLZ_240312,[]
267
+ claude_instant_1,0.8850758180367119,arena_elo,BLZ_240312,[]
268
+ gpt_3.5_turbo_0314,0.8818834796488427,arena_elo,BLZ_240312,[]
269
+ wizardlm_70b_v1.0,0.8818834796488427,arena_elo,BLZ_240312,[]
270
+ tulu_2_dpo_70b,0.8810853950518756,arena_elo,BLZ_240312,[]
271
+ vicuna_33b,0.8723064644852354,arena_elo,BLZ_240312,[]
272
+ starling_lm_7b_alpha,0.8699122106943336,arena_elo,BLZ_240312,[]
273
+ deepseek_llm_67b_chat,0.8635275339185954,arena_elo,BLZ_240312,[]
274
+ llama_2_70b_chat,0.8635275339185954,arena_elo,BLZ_240312,[]
275
+ nv_llama2_70b_steerlm_chat,0.8603351955307262,arena_elo,BLZ_240312,[]
276
+ openhermes_2.5_mistral_7b,0.8603351955307262,arena_elo,BLZ_240312,[]
277
+ openchat_3.5,0.8587390263367917,arena_elo,BLZ_240312,[]
278
+ pplx_70b_online,0.8587390263367917,arena_elo,BLZ_240312,[]
279
+ gpt_3.5_turbo_1106,0.8547486033519553,arena_elo,BLZ_240312,[]
280
+ solar_10.7b_instruct_v1.0,0.8499600957701516,arena_elo,BLZ_240312,[]
281
+ dolphin_2.2.1_mistral_7b,0.8499600957701516,arena_elo,BLZ_240312,[]
282
+ wizardlm_13b_v1.2,0.8443735035913806,arena_elo,BLZ_240312,[]
283
+ zephyr_7b_beta,0.8387869114126097,arena_elo,BLZ_240312,[]
284
+ mpt_30b_chat,0.8332003192338387,arena_elo,BLZ_240312,[]
285
+ codellama_34b_instruct,0.8324022346368715,arena_elo,BLZ_240312,[]
286
+ llama_2_13b_chat,0.8316041500399042,arena_elo,BLZ_240312,[]
287
+ vicuna_13b,0.8300079808459697,arena_elo,BLZ_240312,[]
288
+ pplx_7b_online,0.8284118116520351,arena_elo,BLZ_240312,[]
289
+ zephyr_7b_alpha,0.8276137270550679,arena_elo,BLZ_240312,[]
290
+ qwen_14b_chat,0.825219473264166,arena_elo,BLZ_240312,[]
291
+ falcon_180b_chat,0.8236233040702314,arena_elo,BLZ_240312,[]
292
+ guanaco_33b,0.8236233040702314,arena_elo,BLZ_240312,[]
293
+ llama_2_7b_chat,0.8172386272944933,arena_elo,BLZ_240312,[]
294
+ stripedhyena_nous_7b,0.8140462889066241,arena_elo,BLZ_240312,[]
295
+ mistral_7b_instruct_v0.1,0.8028731045490822,arena_elo,BLZ_240312,[]
296
+ palm_chat_bison_001,0.8028731045490822,arena_elo,BLZ_240312,[]
297
+ vicuna_7b,0.8020750199521149,arena_elo,BLZ_240312,[]
298
+ koala_13b,0.770949720670391,arena_elo,BLZ_240312,[]
299
+ chatglm3_6b,0.7661612130885874,arena_elo,BLZ_240312,[]
300
+ gpt4all_13b_snoozy,0.74780526735834,arena_elo,BLZ_240312,[]
301
+ mpt_7b_chat,0.7430167597765364,arena_elo,BLZ_240312,[]
302
+ chatglm2_6b,0.7422186751795691,arena_elo,BLZ_240312,[]
303
+ rwkv_4_raven_14b,0.7382282521947326,arena_elo,BLZ_240312,[]
304
+ alpaca_13b,0.7214684756584198,arena_elo,BLZ_240312,[]
305
+ openassistant_pythia_12b,0.7158818834796489,arena_elo,BLZ_240312,[]
306
+ chatglm_6b,0.704708699122107,arena_elo,BLZ_240312,[]
307
+ fastchat_t5_3b,0.6975259377494014,arena_elo,BLZ_240312,[]
308
+ stablelm_tuned_alpha_7b,0.6743814844373504,arena_elo,BLZ_240312,[]
309
+ dolly_v2_12b,0.6568236233040702,arena_elo,BLZ_240312,[]
310
+ llama_13b,0.6384676775738228,arena_elo,BLZ_240312,[]
311
+ gpt_4_1106_preview,0.8390000000000001,bbh,BLZ_240312,[]
312
+ gpt_4_0314,0.867,bbh,BLZ_240312,[]
313
+ gpt_4_0613,0.867,bbh,BLZ_240312,[]
314
+ claude_1,0.6729999999999999,bbh,BLZ_240312,[]
315
+ gemini_pro_dev_api,0.6559999999999999,bbh,BLZ_240312,[]
316
+ gpt_3.5_turbo_0613,0.71,bbh,BLZ_240312,[]
317
+ mixtral_8x7b_instruct_v0.1,0.67,bbh,BLZ_240312,[]
318
+ yi_34b_chat,0.7170000000000001,bbh,BLZ_240312,[]
319
+ gemini_pro,0.6559999999999999,bbh,BLZ_240312,[]
320
+ tulu_2_dpo_70b,0.66,bbh,BLZ_240312,[]
321
+ vicuna_33b,0.52,bbh,BLZ_240312,[]
322
+ llama_2_70b_chat,0.608,bbh,BLZ_240312,[]
323
+ gpt_3.5_turbo_1106,0.71,bbh,BLZ_240312,[]
324
+ dolphin_2.2.1_mistral_7b,0.598,bbh,BLZ_240312,[]
325
+ llama_2_13b_chat,0.5820000000000001,bbh,BLZ_240312,[]
326
+ vicuna_13b,0.515,bbh,BLZ_240312,[]
327
+ qwen_14b_chat,0.537,bbh,BLZ_240312,[]
328
+ llama_2_7b_chat,0.35600000000000004,bbh,BLZ_240312,[]
329
+ mistral_7b_instruct_v0.1,0.5670000000000001,bbh,BLZ_240312,[]
330
+ vicuna_7b,0.434,bbh,BLZ_240312,[]
331
+ llama_13b,0.379,bbh,BLZ_240312,[]
332
+ gpt_4_1106_preview,0.8604999999999999,eq_benchv2,BLZ_240312,[]
333
+ gpt_4_0314,0.8573000000000001,eq_benchv2,BLZ_240312,[]
334
+ gpt_4_0613,0.8479000000000001,eq_benchv2,BLZ_240312,[]
335
+ mistral_medium,0.8256999999999999,eq_benchv2,BLZ_240312,[]
336
+ claude_1,0.7683,eq_benchv2,BLZ_240312,[]
337
+ claude_2.0,0.7289,eq_benchv2,BLZ_240312,[]
338
+ gemini_pro_dev_api,0.7508,eq_benchv2,BLZ_240312,[]
339
+ claude_2.1,0.7395999999999999,eq_benchv2,BLZ_240312,[]
340
+ gpt_3.5_turbo_0613,0.6934999999999999,eq_benchv2,BLZ_240312,[]
341
+ mixtral_8x7b_instruct_v0.1,0.7237,eq_benchv2,BLZ_240312,[]
342
+ yi_34b_chat,0.7162000000000001,eq_benchv2,BLZ_240312,[]
343
+ claude_instant_1,0.6904,eq_benchv2,BLZ_240312,[]
344
+ gpt_3.5_turbo_0314,0.7067,eq_benchv2,BLZ_240312,[]
345
+ wizardlm_70b_v1.0,0.7128,eq_benchv2,BLZ_240312,[]
346
+ tulu_2_dpo_70b,0.7663,eq_benchv2,BLZ_240312,[]
347
+ vicuna_33b,0.6707,eq_benchv2,BLZ_240312,[]
348
+ starling_lm_7b_alpha,0.7390000000000001,eq_benchv2,BLZ_240312,[]
349
+ deepseek_llm_67b_chat,0.7753,eq_benchv2,BLZ_240312,[]
350
+ llama_2_70b_chat,0.7359,eq_benchv2,BLZ_240312,[]
351
+ openhermes_2.5_mistral_7b,0.6689,eq_benchv2,BLZ_240312,[]
352
+ openchat_3.5,0.7218000000000001,eq_benchv2,BLZ_240312,[]
353
+ pplx_70b_online,0.6279,eq_benchv2,BLZ_240312,[]
354
+ gpt_3.5_turbo_1106,0.7173999999999999,eq_benchv2,BLZ_240312,[]
355
+ solar_10.7b_instruct_v1.0,0.7353000000000001,eq_benchv2,BLZ_240312,[]
356
+ dolphin_2.2.1_mistral_7b,0.6992,eq_benchv2,BLZ_240312,[]
357
+ wizardlm_13b_v1.2,0.6371,eq_benchv2,BLZ_240312,[]
358
+ zephyr_7b_beta,0.5832999999999999,eq_benchv2,BLZ_240312,[]
359
+ codellama_34b_instruct,0.4915,eq_benchv2,BLZ_240312,[]
360
+ llama_2_13b_chat,0.49119999999999997,eq_benchv2,BLZ_240312,[]
361
+ vicuna_13b,0.6739,eq_benchv2,BLZ_240312,[]
362
+ pplx_7b_online,0.4891,eq_benchv2,BLZ_240312,[]
363
+ zephyr_7b_alpha,0.5682,eq_benchv2,BLZ_240312,[]
364
+ qwen_14b_chat,0.6347,eq_benchv2,BLZ_240312,[]
365
+ falcon_180b_chat,0.5682,eq_benchv2,BLZ_240312,[]
366
+ guanaco_33b,0.3611,eq_benchv2,BLZ_240312,[]
367
+ llama_2_7b_chat,0.3632,eq_benchv2,BLZ_240312,[]
368
+ stripedhyena_nous_7b,0.5458,eq_benchv2,BLZ_240312,[]
369
+ mistral_7b_instruct_v0.1,0.5215,eq_benchv2,BLZ_240312,[]
370
+ yi_34bx2_moe_60b,0.7269,eq_benchv2,BLZ_240312,[]
371
+ mixtral_8x7b_instruct_v0.1,0.7641,gpt4all,BLZ_240312,[]
372
+ yi_34b_chat,0.7212999999999999,gpt4all,BLZ_240312,[]
373
+ starling_lm_7b_alpha,0.7272,gpt4all,BLZ_240312,[]
374
+ openhermes_2.5_mistral_7b,0.7312000000000001,gpt4all,BLZ_240312,[]
375
+ openchat_3.5,0.7292000000000001,gpt4all,BLZ_240312,[]
376
+ solar_10.7b_instruct_v1.0,0.7511,gpt4all,BLZ_240312,[]
377
+ dolphin_2.2.1_mistral_7b,0.7223999999999999,gpt4all,BLZ_240312,[]
378
+ zephyr_7b_beta,0.7182999999999999,gpt4all,BLZ_240312,[]
379
+ vicuna_13b,0.631,gpt4all,BLZ_240312,[]
380
+ zephyr_7b_alpha,0.7223999999999999,gpt4all,BLZ_240312,[]
381
+ mistral_7b_instruct_v0.1,0.6795,gpt4all,BLZ_240312,[]
382
+ vicuna_7b,0.61,gpt4all,BLZ_240312,[]
383
+ koala_13b,0.62,gpt4all,BLZ_240312,[]
384
+ gpt4all_13b_snoozy,0.653,gpt4all,BLZ_240312,[]
385
+ mpt_7b_chat,0.648,gpt4all,BLZ_240312,[]
386
+ openassistant_pythia_12b,0.61,gpt4all,BLZ_240312,[]
387
+ fastchat_t5_3b,0.537,gpt4all,BLZ_240312,[]
388
+ stablelm_tuned_alpha_7b,0.513,gpt4all,BLZ_240312,[]
389
+ llama_13b,0.63,gpt4all,BLZ_240312,[]
390
+ mixtral_8x7b_instruct_v0.1,0.7262000000000001,hugging_6,BLZ_240312,[]
391
+ yi_34b_chat,0.6531999999999999,hugging_6,BLZ_240312,[]
392
+ wizardlm_70b_v1.0,0.6125,hugging_6,BLZ_240312,[]
393
+ tulu_2_dpo_70b,0.7376999999999999,hugging_6,BLZ_240312,[]
394
+ vicuna_33b,0.585,hugging_6,BLZ_240312,[]
395
+ starling_lm_7b_alpha,0.6713,hugging_6,BLZ_240312,[]
396
+ llama_2_70b_chat,0.624,hugging_6,BLZ_240312,[]
397
+ openhermes_2.5_mistral_7b,0.6152000000000001,hugging_6,BLZ_240312,[]
398
+ openchat_3.5,0.6124,hugging_6,BLZ_240312,[]
399
+ solar_10.7b_instruct_v1.0,0.742,hugging_6,BLZ_240312,[]
400
+ dolphin_2.2.1_mistral_7b,0.6493000000000001,hugging_6,BLZ_240312,[]
401
+ wizardlm_13b_v1.2,0.5476,hugging_6,BLZ_240312,[]
402
+ zephyr_7b_beta,0.6195,hugging_6,BLZ_240312,[]
403
+ mpt_30b_chat,0.5538000000000001,hugging_6,BLZ_240312,[]
404
+ codellama_34b_instruct,0.5729,hugging_6,BLZ_240312,[]
405
+ llama_2_13b_chat,0.5490999999999999,hugging_6,BLZ_240312,[]
406
+ vicuna_13b,0.5539999999999999,hugging_6,BLZ_240312,[]
407
+ zephyr_7b_alpha,0.595,hugging_6,BLZ_240312,[]
408
+ falcon_180b_chat,0.6785,hugging_6,BLZ_240312,[]
409
+ llama_2_7b_chat,0.5074000000000001,hugging_6,BLZ_240312,[]
410
+ mistral_7b_instruct_v0.1,0.5496,hugging_6,BLZ_240312,[]
411
+ vicuna_7b,0.521,hugging_6,BLZ_240312,[]
412
+ yi_34bx2_moe_60b,0.7672,hugging_6,BLZ_240312,[]
413
+ gpt_4_0314,0.93,llmonitor,BLZ_240312,[]
414
+ gpt_4_0613,0.89,llmonitor,BLZ_240312,[]
415
+ claude_1,0.66,llmonitor,BLZ_240312,[]
416
+ claude_2.0,0.68,llmonitor,BLZ_240312,[]
417
+ gpt_3.5_turbo_0613,0.81,llmonitor,BLZ_240312,[]
418
+ claude_instant_1,0.6,llmonitor,BLZ_240312,[]
419
+ gpt_3.5_turbo_0314,0.79,llmonitor,BLZ_240312,[]
420
+ llama_2_70b_chat,0.6,llmonitor,BLZ_240312,[]
421
+ mpt_30b_chat,0.4,llmonitor,BLZ_240312,[]
422
+ codellama_34b_instruct,0.34,llmonitor,BLZ_240312,[]
423
+ llama_2_13b_chat,0.5,llmonitor,BLZ_240312,[]
424
+ vicuna_13b,0.5,llmonitor,BLZ_240312,[]
425
+ falcon_180b_chat,0.67,llmonitor,BLZ_240312,[]
426
+ guanaco_33b,0.43,llmonitor,BLZ_240312,[]
427
+ llama_2_7b_chat,0.5,llmonitor,BLZ_240312,[]
428
+ mistral_7b_instruct_v0.1,0.57,llmonitor,BLZ_240312,[]
429
+ palm_chat_bison_001,0.57,llmonitor,BLZ_240312,[]
430
+ vicuna_7b,0.41,llmonitor,BLZ_240312,[]
431
+ koala_13b,0.31,llmonitor,BLZ_240312,[]
432
+ mpt_7b_chat,0.43,llmonitor,BLZ_240312,[]
433
+ dolly_v2_12b,0.23,llmonitor,BLZ_240312,[]
434
+ mistral_medium,0.654,magi,BLZ_240312,[]
435
+ gemini_pro_dev_api,0.528,magi,BLZ_240312,[]
436
+ gpt_3.5_turbo_0613,0.455,magi,BLZ_240312,[]
437
+ mixtral_8x7b_instruct_v0.1,0.49560000000000004,magi,BLZ_240312,[]
438
+ yi_34b_chat,0.5821999999999999,magi,BLZ_240312,[]
439
+ gpt_3.5_turbo_0314,0.512,magi,BLZ_240312,[]
440
+ wizardlm_70b_v1.0,0.4476,magi,BLZ_240312,[]
441
+ tulu_2_dpo_70b,0.5212,magi,BLZ_240312,[]
442
+ vicuna_33b,0.3837,magi,BLZ_240312,[]
443
+ starling_lm_7b_alpha,0.4304,magi,BLZ_240312,[]
444
+ deepseek_llm_67b_chat,0.5946,magi,BLZ_240312,[]
445
+ llama_2_70b_chat,0.39899999999999997,magi,BLZ_240312,[]
446
+ openhermes_2.5_mistral_7b,0.4236,magi,BLZ_240312,[]
447
+ openchat_3.5,0.42200000000000004,magi,BLZ_240312,[]
448
+ gpt_3.5_turbo_1106,0.462,magi,BLZ_240312,[]
449
+ solar_10.7b_instruct_v1.0,0.4693,magi,BLZ_240312,[]
450
+ dolphin_2.2.1_mistral_7b,0.3782,magi,BLZ_240312,[]
451
+ wizardlm_13b_v1.2,0.3678,magi,BLZ_240312,[]
452
+ zephyr_7b_beta,0.4042,magi,BLZ_240312,[]
453
+ llama_2_13b_chat,0.37170000000000003,magi,BLZ_240312,[]
454
+ vicuna_13b,0.36560000000000004,magi,BLZ_240312,[]
455
+ zephyr_7b_alpha,0.39899999999999997,magi,BLZ_240312,[]
456
+ qwen_14b_chat,0.4535,magi,BLZ_240312,[]
457
+ guanaco_33b,0.38659999999999994,magi,BLZ_240312,[]
458
+ llama_2_7b_chat,0.35969999999999996,magi,BLZ_240312,[]
459
+ mistral_7b_instruct_v0.1,0.3704,magi,BLZ_240312,[]
460
+ gpt_4_1106_preview,0.805,mmlu,BLZ_240312,[]
461
+ gpt_4_0314,0.8640000000000001,mmlu,BLZ_240312,[]
462
+ mistral_medium,0.753,mmlu,BLZ_240312,[]
463
+ claude_1,0.77,mmlu,BLZ_240312,[]
464
+ claude_2.0,0.785,mmlu,BLZ_240312,[]
465
+ gemini_pro_dev_api,0.718,mmlu,BLZ_240312,[]
466
+ mixtral_8x7b_instruct_v0.1,0.706,mmlu,BLZ_240312,[]
467
+ yi_34b_chat,0.735,mmlu,BLZ_240312,[]
468
+ gemini_pro,0.718,mmlu,BLZ_240312,[]
469
+ claude_instant_1,0.7340000000000001,mmlu,BLZ_240312,[]
470
+ gpt_3.5_turbo_0314,0.7,mmlu,BLZ_240312,[]
471
+ wizardlm_70b_v1.0,0.637,mmlu,BLZ_240312,[]
472
+ tulu_2_dpo_70b,0.698,mmlu,BLZ_240312,[]
473
+ vicuna_33b,0.5920000000000001,mmlu,BLZ_240312,[]
474
+ starling_lm_7b_alpha,0.639,mmlu,BLZ_240312,[]
475
+ deepseek_llm_67b_chat,0.713,mmlu,BLZ_240312,[]
476
+ llama_2_70b_chat,0.63,mmlu,BLZ_240312,[]
477
+ nv_llama2_70b_steerlm_chat,0.685,mmlu,BLZ_240312,[]
478
+ openhermes_2.5_mistral_7b,0.638,mmlu,BLZ_240312,[]
479
+ openchat_3.5,0.643,mmlu,BLZ_240312,[]
480
+ gpt_3.5_turbo_1106,0.6779999999999999,mmlu,BLZ_240312,[]
481
+ solar_10.7b_instruct_v1.0,0.662,mmlu,BLZ_240312,[]
482
+ dolphin_2.2.1_mistral_7b,0.632,mmlu,BLZ_240312,[]
483
+ wizardlm_13b_v1.2,0.527,mmlu,BLZ_240312,[]
484
+ zephyr_7b_beta,0.614,mmlu,BLZ_240312,[]
485
+ mpt_30b_chat,0.504,mmlu,BLZ_240312,[]
486
+ codellama_34b_instruct,0.537,mmlu,BLZ_240312,[]
487
+ llama_2_13b_chat,0.536,mmlu,BLZ_240312,[]
488
+ vicuna_13b,0.5579999999999999,mmlu,BLZ_240312,[]
489
+ zephyr_7b_alpha,0.614,mmlu,BLZ_240312,[]
490
+ qwen_14b_chat,0.665,mmlu,BLZ_240312,[]
491
+ falcon_180b_chat,0.68,mmlu,BLZ_240312,[]
492
+ guanaco_33b,0.5760000000000001,mmlu,BLZ_240312,[]
493
+ llama_2_7b_chat,0.45799999999999996,mmlu,BLZ_240312,[]
494
+ mistral_7b_instruct_v0.1,0.5539999999999999,mmlu,BLZ_240312,[]
495
+ vicuna_7b,0.51,mmlu,BLZ_240312,[]
496
+ koala_13b,0.447,mmlu,BLZ_240312,[]
497
+ gpt4all_13b_snoozy,0.43,mmlu,BLZ_240312,[]
498
+ mpt_7b_chat,0.32,mmlu,BLZ_240312,[]
499
+ chatglm2_6b,0.455,mmlu,BLZ_240312,[]
500
+ rwkv_4_raven_14b,0.256,mmlu,BLZ_240312,[]
501
+ alpaca_13b,0.48100000000000004,mmlu,BLZ_240312,[]
502
+ openassistant_pythia_12b,0.27,mmlu,BLZ_240312,[]
503
+ chatglm_6b,0.361,mmlu,BLZ_240312,[]
504
+ fastchat_t5_3b,0.47700000000000004,mmlu,BLZ_240312,[]
505
+ stablelm_tuned_alpha_7b,0.244,mmlu,BLZ_240312,[]
506
+ dolly_v2_12b,0.257,mmlu,BLZ_240312,[]
507
+ llama_13b,0.47,mmlu,BLZ_240312,[]
508
+ yi_34bx2_moe_60b,0.775,mmlu,BLZ_240312,[]
509
+ gpt_4_0125_preview,0.0929,mt_bench,BLZ_240312,[]
510
+ gpt_4_1106_preview,0.0932,mt_bench,BLZ_240312,[]
511
+ gpt_4_0314,0.08960000000000001,mt_bench,BLZ_240312,[]
512
+ gpt_4_0613,0.09179999999999999,mt_bench,BLZ_240312,[]
513
+ mistral_medium,0.0861,mt_bench,BLZ_240312,[]
514
+ claude_1,0.079,mt_bench,BLZ_240312,[]
515
+ claude_2.0,0.0806,mt_bench,BLZ_240312,[]
516
+ gemini_pro_dev_api,0.08039999999999999,mt_bench,BLZ_240312,[]
517
+ claude_2.1,0.0818,mt_bench,BLZ_240312,[]
518
+ gpt_3.5_turbo_0613,0.0839,mt_bench,BLZ_240312,[]
519
+ mixtral_8x7b_instruct_v0.1,0.083,mt_bench,BLZ_240312,[]
520
+ yi_34b_chat,0.07769999999999999,mt_bench,BLZ_240312,[]
521
+ gemini_pro,0.08039999999999999,mt_bench,BLZ_240312,[]
522
+ claude_instant_1,0.0785,mt_bench,BLZ_240312,[]
523
+ gpt_3.5_turbo_0314,0.0794,mt_bench,BLZ_240312,[]
524
+ wizardlm_70b_v1.0,0.0771,mt_bench,BLZ_240312,[]
525
+ tulu_2_dpo_70b,0.0789,mt_bench,BLZ_240312,[]
526
+ vicuna_33b,0.0712,mt_bench,BLZ_240312,[]
527
+ starling_lm_7b_alpha,0.0809,mt_bench,BLZ_240312,[]
528
+ deepseek_llm_67b_chat,0.08529999999999999,mt_bench,BLZ_240312,[]
529
+ llama_2_70b_chat,0.06860000000000001,mt_bench,BLZ_240312,[]
530
+ nv_llama2_70b_steerlm_chat,0.0754,mt_bench,BLZ_240312,[]
531
+ openhermes_2.5_mistral_7b,0.07690000000000001,mt_bench,BLZ_240312,[]
532
+ openchat_3.5,0.0781,mt_bench,BLZ_240312,[]
533
+ pplx_70b_online,0.0588,mt_bench,BLZ_240312,[]
534
+ gpt_3.5_turbo_1106,0.0832,mt_bench,BLZ_240312,[]
535
+ solar_10.7b_instruct_v1.0,0.0758,mt_bench,BLZ_240312,[]
536
+ wizardlm_13b_v1.2,0.07200000000000001,mt_bench,BLZ_240312,[]
537
+ zephyr_7b_beta,0.07339999999999999,mt_bench,BLZ_240312,[]
538
+ mpt_30b_chat,0.0639,mt_bench,BLZ_240312,[]
539
+ llama_2_13b_chat,0.0665,mt_bench,BLZ_240312,[]
540
+ vicuna_13b,0.06570000000000001,mt_bench,BLZ_240312,[]
541
+ zephyr_7b_alpha,0.0688,mt_bench,BLZ_240312,[]
542
+ qwen_14b_chat,0.0696,mt_bench,BLZ_240312,[]
543
+ guanaco_33b,0.0653,mt_bench,BLZ_240312,[]
544
+ llama_2_7b_chat,0.06269999999999999,mt_bench,BLZ_240312,[]
545
+ mistral_7b_instruct_v0.1,0.0684,mt_bench,BLZ_240312,[]
546
+ palm_chat_bison_001,0.064,mt_bench,BLZ_240312,[]
547
+ vicuna_7b,0.0617,mt_bench,BLZ_240312,[]
548
+ koala_13b,0.0535,mt_bench,BLZ_240312,[]
549
+ gpt4all_13b_snoozy,0.0541,mt_bench,BLZ_240312,[]
550
+ mpt_7b_chat,0.0542,mt_bench,BLZ_240312,[]
551
+ chatglm2_6b,0.0496,mt_bench,BLZ_240312,[]
552
+ rwkv_4_raven_14b,0.0398,mt_bench,BLZ_240312,[]
553
+ alpaca_13b,0.0453,mt_bench,BLZ_240312,[]
554
+ openassistant_pythia_12b,0.0432,mt_bench,BLZ_240312,[]
555
+ chatglm_6b,0.045,mt_bench,BLZ_240312,[]
556
+ fastchat_t5_3b,0.0304,mt_bench,BLZ_240312,[]
557
+ stablelm_tuned_alpha_7b,0.0275,mt_bench,BLZ_240312,[]
558
+ dolly_v2_12b,0.032799999999999996,mt_bench,BLZ_240312,[]
559
+ llama_13b,0.026099999999999998,mt_bench,BLZ_240312,[]
560
+ gpt_4_0613,0.735,mmlu,helm_lite_240610,[]
561
+ llama_3_70b,0.695,mmlu,helm_lite_240610,[]
562
+ mixtral_8x22b,0.701,mmlu,helm_lite_240610,[]
563
+ palmyra_x_v3_72b,0.702,mmlu,helm_lite_240610,[]
564
+ gpt_4_turbo_1106_preview,0.699,mmlu,helm_lite_240610,[]
565
+ palm_2_unicorn,0.702,mmlu,helm_lite_240610,[]
566
+ claude_3_opus_20240229,0.768,mmlu,helm_lite_240610,[]
567
+ qwen1.5_72b,0.647,mmlu,helm_lite_240610,[]
568
+ palmyra_x_v2_33b,0.621,mmlu,helm_lite_240610,[]
569
+ yi_34b,0.65,mmlu,helm_lite_240610,[]
570
+ qwen1.5_32b,0.628,mmlu,helm_lite_240610,[]
571
+ claude_v1.3,0.631,mmlu,helm_lite_240610,[]
572
+ mixtral_8x7b_32k_seqlen,0.649,mmlu,helm_lite_240610,[]
573
+ palm_2_bison,0.608,mmlu,helm_lite_240610,[]
574
+ claude_2.0,0.639,mmlu,helm_lite_240610,[]
575
+ deepseek_llm_chat_67b,0.641,mmlu,helm_lite_240610,[]
576
+ llama_2_70b,0.58,mmlu,helm_lite_240610,[]
577
+ claude_2.1,0.643,mmlu,helm_lite_240610,[]
578
+ gpt_3.5_text_davinci_003,0.555,mmlu,helm_lite_240610,[]
579
+ qwen1.5_14b,0.626,mmlu,helm_lite_240610,[]
580
+ claude_instant_1.2,0.631,mmlu,helm_lite_240610,[]
581
+ llama_3_8b,0.602,mmlu,helm_lite_240610,[]
582
+ gpt_3.5_turbo_0613,0.614,mmlu,helm_lite_240610,[]
583
+ gemma_7b,0.571,mmlu,helm_lite_240610,[]
584
+ claude_3_sonnet_20240229,0.652,mmlu,helm_lite_240610,[]
585
+ gpt_3.5_text_davinci_002,0.568,mmlu,helm_lite_240610,[]
586
+ llama_65b,0.584,mmlu,helm_lite_240610,[]
587
+ mistral_large_2402,0.638,mmlu,helm_lite_240610,[]
588
+ cohere_command,0.525,mmlu,helm_lite_240610,[]
589
+ dbrx_instructruct,0.643,mmlu,helm_lite_240610,[]
590
+ mistral_v0.1_7b,0.584,mmlu,helm_lite_240610,[]
591
+ mistral_small_2402,0.593,mmlu,helm_lite_240610,[]
592
+ mistral_medium_2312,0.618,mmlu,helm_lite_240610,[]
593
+ qwen1.5_7b,0.569,mmlu,helm_lite_240610,[]
594
+ claude_3_haiku_20240307,0.662,mmlu,helm_lite_240610,[]
595
+ yi_6b,0.53,mmlu,helm_lite_240610,[]
596
+ llama_2_13b,0.505,mmlu,helm_lite_240610,[]
597
+ jurassic_2_jumbo_178b,0.483,mmlu,helm_lite_240610,[]
598
+ falcon_40b,0.507,mmlu,helm_lite_240610,[]
599
+ phi_2,0.518,mmlu,helm_lite_240610,[]
600
+ jurassic_2_grande_17b,0.471,mmlu,helm_lite_240610,[]
601
+ llama_2_7b,0.425,mmlu,helm_lite_240610,[]
602
+ luminous_supreme_70b,0.316,mmlu,helm_lite_240610,[]
603
+ cohere_command_light,0.386,mmlu,helm_lite_240610,[]
604
+ luminous_extended_30b,0.248,mmlu,helm_lite_240610,[]
605
+ falcon_7b,0.288,mmlu,helm_lite_240610,[]
606
+ olmo_7b,0.305,mmlu,helm_lite_240610,[]
607
+ luminous_base_13b,0.243,mmlu,helm_lite_240610,[]
608
+ llama_2_70b,0.582,mmlu,helm_classic_240130,[]
609
+ llama_65b,0.584,mmlu,helm_classic_240130,[]
610
+ text_davinci_002,0.568,mmlu,helm_classic_240130,[]
611
+ mistral_v0.1_7b,0.572,mmlu,helm_classic_240130,[]
612
+ cohere_command_beta_52.4b,0.452,mmlu,helm_classic_240130,[]
613
+ text_davinci_003,0.569,mmlu,helm_classic_240130,[]
614
+ jurassic_2_jumbo_178b,0.48,mmlu,helm_classic_240130,[]
615
+ llama_2_13b,0.507,mmlu,helm_classic_240130,[]
616
+ tnlg_v2_530b,0.469,mmlu,helm_classic_240130,[]
617
+ gpt_3.5_turbo_0613,0.391,mmlu,helm_classic_240130,[]
618
+ llama_30b,0.531,mmlu,helm_classic_240130,[]
619
+ anthropic_lm_v4_s3_52b,0.481,mmlu,helm_classic_240130,[]
620
+ gpt_3.5_turbo_0301,0.59,mmlu,helm_classic_240130,[]
621
+ jurassic_2_grande_17b,0.475,mmlu,helm_classic_240130,[]
622
+ palmyra_x_43b,0.609,mmlu,helm_classic_240130,[]
623
+ falcon_40b,0.509,mmlu,helm_classic_240130,[]
624
+ falcon_instruct_40b,0.497,mmlu,helm_classic_240130,[]
625
+ mpt_instruct_30b,0.444,mmlu,helm_classic_240130,[]
626
+ mpt_30b,0.437,mmlu,helm_classic_240130,[]
627
+ j1_grande_v2_beta_17b,0.445,mmlu,helm_classic_240130,[]
628
+ vicuna_v1.3_13b,0.462,mmlu,helm_classic_240130,[]
629
+ cohere_command_beta_6.1b,0.406,mmlu,helm_classic_240130,[]
630
+ cohere_xlarge_v20221108_52.4b,0.382,mmlu,helm_classic_240130,[]
631
+ luminous_supreme_70b,0.38,mmlu,helm_classic_240130,[]
632
+ vicuna_v1.3_7b,0.434,mmlu,helm_classic_240130,[]
633
+ opt_175b,0.318,mmlu,helm_classic_240130,[]
634
+ llama_2_7b,0.431,mmlu,helm_classic_240130,[]
635
+ llama_13b,0.422,mmlu,helm_classic_240130,[]
636
+ instructpalmyra_30b,0.403,mmlu,helm_classic_240130,[]
637
+ cohere_xlarge_v20220609_52.4b,0.353,mmlu,helm_classic_240130,[]
638
+ jurassic_2_large_7.5b,0.339,mmlu,helm_classic_240130,[]
639
+ davinci_175b,0.422,mmlu,helm_classic_240130,[]
640
+ llama_7b,0.321,mmlu,helm_classic_240130,[]
641
+ redpajama_incite_instruct_7b,0.363,mmlu,helm_classic_240130,[]
642
+ j1_jumbo_v1_178b,0.259,mmlu,helm_classic_240130,[]
643
+ glm_130b,0.344,mmlu,helm_classic_240130,[]
644
+ luminous_extended_30b,0.321,mmlu,helm_classic_240130,[]
645
+ opt_66b,0.276,mmlu,helm_classic_240130,[]
646
+ bloom_176b,0.299,mmlu,helm_classic_240130,[]
647
+ j1_grande_v1_17b,0.27,mmlu,helm_classic_240130,[]
648
+ alpaca_7b,0.385,mmlu,helm_classic_240130,[]
649
+ falcon_7b,0.286,mmlu,helm_classic_240130,[]
650
+ redpajama_incite_base_7b,0.302,mmlu,helm_classic_240130,[]
651
+ cohere_large_v20220720_13.1b,0.324,mmlu,helm_classic_240130,[]
652
+ redpajama_incite_instruct_v1_3b,0.257,mmlu,helm_classic_240130,[]
653
+ text_curie_001,0.237,mmlu,helm_classic_240130,[]
654
+ gpt_neox_20b,0.276,mmlu,helm_classic_240130,[]
655
+ luminous_base_13b,0.27,mmlu,helm_classic_240130,[]
656
+ cohere_medium_v20221108_6.1b,0.254,mmlu,helm_classic_240130,[]
657
+ redpajama_incite_base_v1_3b,0.263,mmlu,helm_classic_240130,[]
658
+ tnlg_v2_6.7b,0.242,mmlu,helm_classic_240130,[]
659
+ j1_large_v1_7.5b,0.241,mmlu,helm_classic_240130,[]
660
+ gpt_j_6b,0.249,mmlu,helm_classic_240130,[]
661
+ pythia_12b,0.274,mmlu,helm_classic_240130,[]
662
+ curie_6.7b,0.243,mmlu,helm_classic_240130,[]
663
+ falcon_instruct_7b,0.275,mmlu,helm_classic_240130,[]
664
+ cohere_medium_v20220720_6.1b,0.279,mmlu,helm_classic_240130,[]
665
+ text_babbage_001,0.229,mmlu,helm_classic_240130,[]
666
+ t0pp_11b,0.407,mmlu,helm_classic_240130,[]
667
+ pythia_6.9b,0.236,mmlu,helm_classic_240130,[]
668
+ ul2_20b,0.291,mmlu,helm_classic_240130,[]
669
+ t5_11b,0.29,mmlu,helm_classic_240130,[]
670
+ babbage_1.3b,0.235,mmlu,helm_classic_240130,[]
671
+ cohere_small_v20220720_410m,0.264,mmlu,helm_classic_240130,[]
672
+ ada_350m,0.243,mmlu,helm_classic_240130,[]
673
+ text_ada_001,0.238,mmlu,helm_classic_240130,[]
674
+ yalm_100b,0.243,mmlu,helm_classic_240130,[]
675
+ aya_101,0.029411764705882353,biggen_mwr,biggen_240612,[]
676
+ c4ai_command_r_plus_gptq,0.8382352941176471,biggen_mwr,biggen_240612,[]
677
+ c4ai_command_r_v01,0.6948529411764706,biggen_mwr,biggen_240612,[]
678
+ claude_3_haiku_20240307,0.9252450980392157,biggen_mwr,biggen_240612,[]
679
+ claude_3_opus_20240229,0.9681372549019608,biggen_mwr,biggen_240612,[]
680
+ claude_3_sonnet_20240229,0.9240196078431373,biggen_mwr,biggen_240612,[]
681
+ codellama_13b,0.07598039215686275,biggen_mwr,biggen_240612,[]
682
+ codellama_13b_instruct,0.4276960784313726,biggen_mwr,biggen_240612,[]
683
+ codellama_34b,0.1482843137254902,biggen_mwr,biggen_240612,[]
684
+ codellama_34b_instruct,0.5098039215686274,biggen_mwr,biggen_240612,[]
685
+ codellama_70b,0.18872549019607843,biggen_mwr,biggen_240612,[]
686
+ codellama_70b_instruct,0.27450980392156865,biggen_mwr,biggen_240612,[]
687
+ codellama_7b,0.05514705882352941,biggen_mwr,biggen_240612,[]
688
+ codellama_7b_instruct,0.36519607843137253,biggen_mwr,biggen_240612,[]
689
+ codetulu_2_13b,0.43137254901960786,biggen_mwr,biggen_240612,[]
690
+ codetulu_2_34b,0.5441176470588235,biggen_mwr,biggen_240612,[]
691
+ codetulu_2_7b,0.32598039215686275,biggen_mwr,biggen_240612,[]
692
+ gemini_1.0_pro,0.7107843137254902,biggen_mwr,biggen_240612,[]
693
+ gemini_flash_1.5,0.866421568627451,biggen_mwr,biggen_240612,[]
694
+ gemini_pro_1.5,0.8676470588235294,biggen_mwr,biggen_240612,[]
695
+ gemma_1.1_2b_it,0.33578431372549017,biggen_mwr,biggen_240612,[]
696
+ gemma_1.1_7b_it,0.5551470588235294,biggen_mwr,biggen_240612,[]
697
+ gemma_2b,0.09803921568627451,biggen_mwr,biggen_240612,[]
698
+ gemma_2b_it,0.3333333333333333,biggen_mwr,biggen_240612,[]
699
+ gemma_7b,0.013480392156862746,biggen_mwr,biggen_240612,[]
700
+ gemma_7b_it,0.40931372549019607,biggen_mwr,biggen_240612,[]
701
+ gpt_3.5_turbo_0125,0.7757352941176471,biggen_mwr,biggen_240612,[]
702
+ gpt_3.5_turbo_1106,0.758578431372549,biggen_mwr,biggen_240612,[]
703
+ gpt_4_0125_preview,0.9779411764705882,biggen_mwr,biggen_240612,[]
704
+ gpt_4_1106_preview,0.9889705882352942,biggen_mwr,biggen_240612,[]
705
+ gpt_4_turbo_2024_04_09,0.9558823529411765,biggen_mwr,biggen_240612,[]
706
+ gpt_4o_2024_05_13,0.9436274509803921,biggen_mwr,biggen_240612,[]
707
+ llama_2_13b,0.20220588235294118,biggen_mwr,biggen_240612,[]
708
+ llama_2_13b_chat,0.5968137254901961,biggen_mwr,biggen_240612,[]
709
+ llama_2_70b,0.4656862745098039,biggen_mwr,biggen_240612,[]
710
+ llama_2_70b_chat,0.7205882352941176,biggen_mwr,biggen_240612,[]
711
+ llama_2_7b,0.1446078431372549,biggen_mwr,biggen_240612,[]
712
+ llama_2_7b_chat,0.5355392156862745,biggen_mwr,biggen_240612,[]
713
+ llemma_34b,0.21200980392156862,biggen_mwr,biggen_240612,[]
714
+ llemma_7b,0.11029411764705882,biggen_mwr,biggen_240612,[]
715
+ meta_llama_3_70b,0.36887254901960786,biggen_mwr,biggen_240612,[]
716
+ meta_llama_3_70b_instruct,0.875,biggen_mwr,biggen_240612,[]
717
+ meta_llama_3_8b,0.2377450980392157,biggen_mwr,biggen_240612,[]
718
+ meta_llama_3_8b_instruct,0.7328431372549019,biggen_mwr,biggen_240612,[]
719
+ mistral_7b_instruct_v0.2,0.7156862745098039,biggen_mwr,biggen_240612,[]
720
+ mistral_7b_v0.1,0.3272058823529412,biggen_mwr,biggen_240612,[]
721
+ mistral_7b_v0.2,0.3137254901960784,biggen_mwr,biggen_240612,[]
722
+ mistral_large_hjpark,0.8762254901960784,biggen_mwr,biggen_240612,[]
723
+ mistral_medium_hjpark,0.8970588235294118,biggen_mwr,biggen_240612,[]
724
+ mistral_orpo_alpha,0.5392156862745098,biggen_mwr,biggen_240612,[]
725
+ mistral_orpo_beta,0.5477941176470589,biggen_mwr,biggen_240612,[]
726
+ mixtral_8x22b_instruct_v0.1_awq,0.8198529411764706,biggen_mwr,biggen_240612,[]
727
+ mixtral_8x22b_v0.1_awq,0.5968137254901961,biggen_mwr,biggen_240612,[]
728
+ mixtral_8x7b_instruct_v0.1,0.7647058823529411,biggen_mwr,biggen_240612,[]
729
+ mixtral_8x7b_v0.1,0.5453431372549019,biggen_mwr,biggen_240612,[]
730
+ nous_hermes_2_mistral_7b_dpo,0.571078431372549,biggen_mwr,biggen_240612,[]
731
+ nous_hermes_2_mixtral_8x7b_dpo,0.7095588235294118,biggen_mwr,biggen_240612,[]
732
+ nous_hermes_2_mixtral_8x7b_sft,0.6262254901960784,biggen_mwr,biggen_240612,[]
733
+ nous_hermes_2_yi_34b,0.5906862745098039,biggen_mwr,biggen_240612,[]
734
+ olmo_1b,0.028186274509803922,biggen_mwr,biggen_240612,[]
735
+ olmo_7b,0.07107843137254902,biggen_mwr,biggen_240612,[]
736
+ olmo_7b_instruct,0.30269607843137253,biggen_mwr,biggen_240612,[]
737
+ olmo_7b_sft,0.2549019607843137,biggen_mwr,biggen_240612,[]
738
+ openchat_3.5_0106,0.6825980392156863,biggen_mwr,biggen_240612,[]
739
+ openhermes_2.5_mistral_7b,0.4583333333333333,biggen_mwr,biggen_240612,[]
740
+ openhermes_2_mistral_7b,0.5122549019607843,biggen_mwr,biggen_240612,[]
741
+ orca_2_13b,0.17401960784313725,biggen_mwr,biggen_240612,[]
742
+ orca_2_7b,0.08700980392156862,biggen_mwr,biggen_240612,[]
743
+ phi_1,0.0,biggen_mwr,biggen_240612,[]
744
+ phi_1_5,0.15318627450980393,biggen_mwr,biggen_240612,[]
745
+ phi_2,0.29044117647058826,biggen_mwr,biggen_240612,[]
746
+ phi_3_mini_128k_instruct,0.6911764705882353,biggen_mwr,biggen_240612,[]
747
+ phi_3_mini_4k_instruct,0.7867647058823529,biggen_mwr,biggen_240612,[]
748
+ qwen1.5_0.5b,0.0428921568627451,biggen_mwr,biggen_240612,[]
749
+ qwen1.5_0.5b_chat,0.07965686274509803,biggen_mwr,biggen_240612,[]
750
+ qwen1.5_1.8b,0.12867647058823528,biggen_mwr,biggen_240612,[]
751
+ qwen1.5_1.8b_chat,0.21691176470588236,biggen_mwr,biggen_240612,[]
752
+ qwen1.5_14b,0.3946078431372549,biggen_mwr,biggen_240612,[]
753
+ qwen1.5_14b_chat,0.7267156862745098,biggen_mwr,biggen_240612,[]
754
+ qwen1.5_32b,0.4791666666666667,biggen_mwr,biggen_240612,[]
755
+ qwen1.5_32b_chat,0.8149509803921569,biggen_mwr,biggen_240612,[]
756
+ qwen1.5_4b,0.21323529411764705,biggen_mwr,biggen_240612,[]
757
+ qwen1.5_4b_chat,0.29411764705882354,biggen_mwr,biggen_240612,[]
758
+ qwen1.5_72b,0.5294117647058824,biggen_mwr,biggen_240612,[]
759
+ qwen1.5_72b_chat,0.8713235294117647,biggen_mwr,biggen_240612,[]
760
+ qwen1.5_7b,0.2610294117647059,biggen_mwr,biggen_240612,[]
761
+ qwen1.5_7b_chat,0.6580882352941176,biggen_mwr,biggen_240612,[]
762
+ qwen_110b_chat,0.8848039215686274,biggen_mwr,biggen_240612,[]
763
+ solar_10.7b_instruct_v1.0,0.6862745098039216,biggen_mwr,biggen_240612,[]
764
+ solar_10.7b_v1.0,0.43995098039215685,biggen_mwr,biggen_240612,[]
765
+ starling_lm_7b_alpha,0.6139705882352942,biggen_mwr,biggen_240612,[]
766
+ starling_lm_7b_beta,0.7573529411764706,biggen_mwr,biggen_240612,[]
767
+ tulu_2_13b,0.4313725490196078,biggen_mwr,biggen_240612,[]
768
+ tulu_2_7b,0.3553921568627451,biggen_mwr,biggen_240612,[]
769
+ tulu_2_dpo_13b,0.5833333333333333,biggen_mwr,biggen_240612,[]
770
+ tulu_2_dpo_70b,0.7708333333333334,biggen_mwr,biggen_240612,[]
771
+ tulu_2_dpo_7b,0.4767156862745098,biggen_mwr,biggen_240612,[]
772
+ yi_34b,0.46078431372549017,biggen_mwr,biggen_240612,[]
773
+ yi_34b_chat,0.7720588235294118,biggen_mwr,biggen_240612,[]
774
+ yi_6b,0.17892156862745098,biggen_mwr,biggen_240612,[]
775
+ yi_6b_chat,0.4117647058823529,biggen_mwr,biggen_240612,[]
776
+ zephyr_7b_beta,0.6200980392156863,biggen_mwr,biggen_240612,[]
777
+ zephyr_orpo_141b_a35b_v0.1_awq,0.6311274509803921,biggen_mwr,biggen_240612,[]
778
+ gpt_4o_0513,1293.0,arena_elo,wildbench_240612,[]
779
+ gpt_4_turbo_0409,1251.0,arena_elo,wildbench_240612,[]
780
+ gpt_4_turbo_0125,1239.0,arena_elo,wildbench_240612,[]
781
+ llama_3_70b_inst,1213.0,arena_elo,wildbench_240612,[]
782
+ claude_3_opus,1232.0,arena_elo,wildbench_240612,[]
783
+ claude_3_sonnet,1187.0,arena_elo,wildbench_240612,[]
784
+ qwen1.5_72b_chat,1143.0,arena_elo,wildbench_240612,[]
785
+ command_r_plus,1155.0,arena_elo,wildbench_240612,[]
786
+ claude_3_haiku,1169.0,arena_elo,wildbench_240612,[]
787
+ mistral_large,1158.0,arena_elo,wildbench_240612,[]
788
+ starlinglm_7b_beta,1111.0,arena_elo,wildbench_240612,[]
789
+ llama_3_8b_inst,1144.0,arena_elo,wildbench_240612,[]
790
+ command_r,1106.0,arena_elo,wildbench_240612,[]
791
+ mixtral_8x7b_inst,1114.0,arena_elo,wildbench_240612,[]
792
+ dbrx_instruct,1106.0,arena_elo,wildbench_240612,[]
793
+ mistral_7b_inst_v0.2,1071.0,arena_elo,wildbench_240612,[]
794
+ tulu_2_dpo_70b,1099.0,arena_elo,wildbench_240612,[]
795
+ llama_2_70b_chat,1070.0,arena_elo,wildbench_240612,[]
796
+ qwen1.5_7b_chat,1059.0,arena_elo,wildbench_240612,[]
797
+ gpt_3.5_turbo_0125,1105.0,arena_elo,wildbench_240612,[]
798
+ llama_2_7b_chat,1012.0,arena_elo,wildbench_240612,[]
799
+ gemma_7b_it,1047.0,arena_elo,wildbench_240612,[]
800
+ gemma_2b_it,980.0,arena_elo,wildbench_240612,[]
801
+ gpt_4_turbo_0409,82.6,arena_hard,wildbench_240612,[]
802
+ gpt_4_turbo_0125,78.0,arena_hard,wildbench_240612,[]
803
+ llama_3_70b_inst,41.1,arena_hard,wildbench_240612,[]
804
+ claude_3_opus,60.4,arena_hard,wildbench_240612,[]
805
+ llama3_inst_8b_simpo,33.8,arena_hard,wildbench_240612,[]
806
+ claude_3_sonnet,46.8,arena_hard,wildbench_240612,[]
807
+ qwen1.5_72b_chat,36.1,arena_hard,wildbench_240612,[]
808
+ command_r_plus,33.1,arena_hard,wildbench_240612,[]
809
+ claude_3_haiku,41.5,arena_hard,wildbench_240612,[]
810
+ mistral_large,37.7,arena_hard,wildbench_240612,[]
811
+ starlinglm_7b_beta,23.0,arena_hard,wildbench_240612,[]
812
+ llama_3_8b_inst,20.6,arena_hard,wildbench_240612,[]
813
+ command_r,17.0,arena_hard,wildbench_240612,[]
814
+ mixtral_8x7b_inst,23.4,arena_hard,wildbench_240612,[]
815
+ dbrx_instruct,23.9,arena_hard,wildbench_240612,[]
816
+ tulu_2_dpo_70b,15.0,arena_hard,wildbench_240612,[]
817
+ llama_2_70b_chat,11.6,arena_hard,wildbench_240612,[]
818
+ gpt_3.5_turbo_0125,23.3,arena_hard,wildbench_240612,[]
819
+ llama_2_7b_chat,4.6,arena_hard,wildbench_240612,[]
820
+ gemma_7b_it,7.5,arena_hard,wildbench_240612,[]
821
+ gemma_2b_it,3.0,arena_hard,wildbench_240612,[]
822
+ gpt_4o_0513,57.5,alpacaeval2_lc,wildbench_240612,[]
823
+ gpt_4_turbo_0409,55.0,alpacaeval2_lc,wildbench_240612,[]
824
+ llama_3_70b_inst,34.4,alpacaeval2_lc,wildbench_240612,[]
825
+ claude_3_opus,40.5,alpacaeval2_lc,wildbench_240612,[]
826
+ llama3_inst_8b_simpo,44.7,alpacaeval2_lc,wildbench_240612,[]
827
+ claude_3_sonnet,34.9,alpacaeval2_lc,wildbench_240612,[]
828
+ qwen1.5_72b_chat,36.6,alpacaeval2_lc,wildbench_240612,[]
829
+ mistral_large,32.7,alpacaeval2_lc,wildbench_240612,[]
830
+ llama_3_8b_inst,22.9,alpacaeval2_lc,wildbench_240612,[]
831
+ mixtral_8x7b_inst,23.7,alpacaeval2_lc,wildbench_240612,[]
832
+ dbrx_instruct,25.4,alpacaeval2_lc,wildbench_240612,[]
833
+ mistral_7b_inst_v0.2,17.1,alpacaeval2_lc,wildbench_240612,[]
834
+ tulu_2_dpo_70b,21.2,alpacaeval2_lc,wildbench_240612,[]
835
+ llama_2_70b_chat,14.7,alpacaeval2_lc,wildbench_240612,[]
836
+ qwen1.5_7b_chat,14.7,alpacaeval2_lc,wildbench_240612,[]
837
+ llama_2_7b_chat,5.4,alpacaeval2_lc,wildbench_240612,[]
838
+ gemma_7b_it,10.4,alpacaeval2_lc,wildbench_240612,[]
839
+ gemma_2b_it,5.4,alpacaeval2_lc,wildbench_240612,[]
840
+ gpt_4o_0513,51.3,alpacav2,wildbench_240612,[]
841
+ gpt_4_turbo_0409,46.1,alpacav2,wildbench_240612,[]
842
+ llama_3_70b_inst,33.2,alpacav2,wildbench_240612,[]
843
+ claude_3_opus,29.1,alpacav2,wildbench_240612,[]
844
+ llama3_inst_8b_simpo,40.5,alpacav2,wildbench_240612,[]
845
+ claude_3_sonnet,25.6,alpacav2,wildbench_240612,[]
846
+ qwen1.5_72b_chat,26.5,alpacav2,wildbench_240612,[]
847
+ mistral_large,21.4,alpacav2,wildbench_240612,[]
848
+ llama_3_8b_inst,22.6,alpacav2,wildbench_240612,[]
849
+ mixtral_8x7b_inst,18.3,alpacav2,wildbench_240612,[]
850
+ dbrx_instruct,18.4,alpacav2,wildbench_240612,[]
851
+ mistral_7b_inst_v0.2,14.7,alpacav2,wildbench_240612,[]
852
+ tulu_2_dpo_70b,16.0,alpacav2,wildbench_240612,[]
853
+ llama_2_70b_chat,13.9,alpacav2,wildbench_240612,[]
854
+ qwen1.5_7b_chat,11.8,alpacav2,wildbench_240612,[]
855
+ llama_2_7b_chat,5.0,alpacav2,wildbench_240612,[]
856
+ gemma_7b_it,6.9,alpacav2,wildbench_240612,[]
857
+ gemma_2b_it,3.4,alpacav2,wildbench_240612,[]
858
+ pythia_1b,31.4,arc_c,olmes_260624,[]
859
+ olmo_1b,38.6,arc_c,olmes_260624,[]
860
+ tinyllama_1.1b,38.1,arc_c,olmes_260624,[]
861
+ pythia_6.7b,44.6,arc_c,olmes_260624,[]
862
+ rpj_incite_7b,45.3,arc_c,olmes_260624,[]
863
+ stablelm2_1.6b,50.6,arc_c,olmes_260624,[]
864
+ olmo_7b,46.4,arc_c,olmes_260624,[]
865
+ mpt_7b,45.7,arc_c,olmes_260624,[]
866
+ falcon_7b,49.7,arc_c,olmes_260624,[]
867
+ llama2_7b,54.2,arc_c,olmes_260624,[]
868
+ llama2_13b,67.3,arc_c,olmes_260624,[]
869
+ olmo_1.7_7b,66.9,arc_c,olmes_260624,[]
870
+ llama3_8b,79.3,arc_c,olmes_260624,[]
871
+ mistral_7b_v0.1,78.6,arc_c,olmes_260624,[]
872
+ llama3_70b,93.7,arc_c,olmes_260624,[]
873
+ pythia_1b,31.1,mmlu,olmes_260624,[]
874
+ olmo_1b,33.4,mmlu,olmes_260624,[]
875
+ tinyllama_1.1b,33.6,mmlu,olmes_260624,[]
876
+ pythia_6.7b,37.7,mmlu,olmes_260624,[]
877
+ rpj_incite_7b,40.1,mmlu,olmes_260624,[]
878
+ stablelm2_1.6b,40.4,mmlu,olmes_260624,[]
879
+ olmo_7b,40.5,mmlu,olmes_260624,[]
880
+ mpt_7b,40.6,mmlu,olmes_260624,[]
881
+ falcon_7b,42.1,mmlu,olmes_260624,[]
882
+ llama2_7b,46.2,mmlu,olmes_260624,[]
883
+ llama2_13b,55.8,mmlu,olmes_260624,[]
884
+ olmo_1.7_7b,54.4,mmlu,olmes_260624,[]
885
+ llama3_8b,66.6,mmlu,olmes_260624,[]
886
+ mistral_7b_v0.1,64.0,mmlu,olmes_260624,[]
887
+ llama3_70b,79.8,mmlu,olmes_260624,[]
888
+ pythia_1b,49.0,olmes_average,olmes_260624,[]
889
+ olmo_1b,55.1,olmes_average,olmes_260624,[]
890
+ tinyllama_1.1b,55.4,olmes_average,olmes_260624,[]
891
+ pythia_6.7b,59.1,olmes_average,olmes_260624,[]
892
+ rpj_incite_7b,62.8,olmes_average,olmes_260624,[]
893
+ stablelm2_1.6b,65.1,olmes_average,olmes_260624,[]
894
+ olmo_7b,65.3,olmes_average,olmes_260624,[]
895
+ mpt_7b,65.6,olmes_average,olmes_260624,[]
896
+ falcon_7b,66.9,olmes_average,olmes_260624,[]
897
+ llama2_7b,69.0,olmes_average,olmes_260624,[]
898
+ llama2_13b,74.0,olmes_average,olmes_260624,[]
899
+ olmo_1.7_7b,75.5,olmes_average,olmes_260624,[]
900
+ llama3_8b,78.7,olmes_average,olmes_260624,[]
901
+ mistral_7b_v0.1,79.1,olmes_average,olmes_260624,[]
902
+ llama3_70b,88.4,olmes_average,olmes_260624,[]
903
+ llama_2_70b,0.3753,mmlu_pro,mmlu_pro_240610,[]
904
+ llama_3_8b,0.3536,mmlu_pro,mmlu_pro_240610,[]
905
+ deepseekmath_instruct,0.353,mmlu_pro,mmlu_pro_240610,[]
906
+ gemma_7b,0.3373,mmlu_pro,mmlu_pro_240610,[]
907
+ mistral_7b_v0.1,0.3088,mmlu_pro,mmlu_pro_240610,[]
908
+ mistral_7b_instruct_v0.2,0.3084,mmlu_pro,mmlu_pro_240610,[]
909
+ mistral_7b_v0.2,0.3043,mmlu_pro,mmlu_pro_240610,[]
910
+ qwen1.5_7b_chat,0.2906,mmlu_pro,mmlu_pro_240610,[]
911
+ yi_6b_chat,0.2884,mmlu_pro,mmlu_pro_240610,[]
912
+ yi_6b,0.2651,mmlu_pro,mmlu_pro_240610,[]
913
+ mistral_7b_instruct_v0.1,0.2575,mmlu_pro,mmlu_pro_240610,[]
914
+ llama_2_13b,0.2534,mmlu_pro,mmlu_pro_240610,[]
915
+ llemma_7b,0.2345,mmlu_pro,mmlu_pro_240610,[]
916
+ llama_2_7b,0.2032,mmlu_pro,mmlu_pro_240610,[]
917
+ gpt_4o,0.7255,mmlu_pro,mmlu_pro_240610,[]
918
+ claude_3_opus,0.6845,mmlu_pro,mmlu_pro_240610,[]
919
+ gpt_4_turbo,0.6371,mmlu_pro,mmlu_pro_240610,[]
920
+ gemini_1.5_flash,0.5912,mmlu_pro,mmlu_pro_240610,[]
921
+ yi_large,0.5753,mmlu_pro,mmlu_pro_240610,[]
922
+ claude_3_sonnet,0.568,mmlu_pro,mmlu_pro_240610,[]
923
+ llama_3_70b_instruct,0.562,mmlu_pro,mmlu_pro_240610,[]
924
+ deepseek_v2,0.5481,mmlu_pro,mmlu_pro_240610,[]
925
+ phi_3_medium_4k_instruct,0.5348,mmlu_pro,mmlu_pro_240610,[]
926
+ llama_3_70b,0.5278,mmlu_pro,mmlu_pro_240610,[]
927
+ qwen1.5_72b_chat,0.5162,mmlu_pro,mmlu_pro_240610,[]
928
+ mammoth2_8x7b_plus,0.504,mmlu_pro,mmlu_pro_240610,[]
929
+ qwen1.5_110b,0.4993,mmlu_pro,mmlu_pro_240610,[]
930
+ mammoth2_8b_plus,0.4335,mmlu_pro,mmlu_pro_240610,[]
931
+ mixtral_8x7b_instruct_v0.1,0.4327,mmlu_pro,mmlu_pro_240610,[]
932
+ phi_3_mini_4k_instruct,0.4317,mmlu_pro,mmlu_pro_240610,[]
933
+ yi_34b,0.4303,mmlu_pro,mmlu_pro_240610,[]
934
+ mixtral_8x7b_v0.1,0.4103,mmlu_pro,mmlu_pro_240610,[]
935
+ llama_3_8b_instruct,0.4098,mmlu_pro,mmlu_pro_240610,[]
936
+ mammoth2_7b_plus,0.4085,mmlu_pro,mmlu_pro_240610,[]
937
+ qwen1.5_14b_chat,0.3802,mmlu_pro,mmlu_pro_240610,[]
938
+ c4ai_command_r_v01,0.379,mmlu_pro,mmlu_pro_240610,[]
assets/livebench.csv ADDED
@@ -0,0 +1,365 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,model,scenario,score,aggragated_from,source
2
+ 0,claude_3_5_sonnet_20240620,livebench_lb,61.16,[],livebench_240701
3
+ 1,gpt_4o_2024_05_13,livebench_lb,54.96,[],livebench_240701
4
+ 2,gpt_4_turbo_2024_04_09,livebench_lb,53.0,[],livebench_240701
5
+ 3,gpt_4_1106_preview,livebench_lb,52.17,[],livebench_240701
6
+ 4,claude_3_opus_20240229,livebench_lb,50.75,[],livebench_240701
7
+ 5,gpt_4_0125_preview,livebench_lb,49.39,[],livebench_240701
8
+ 6,deepseek_coder_v2,livebench_lb,46.79,[],livebench_240701
9
+ 7,gemini_1.5_pro_api_0514,livebench_lb,44.35,[],livebench_240701
10
+ 8,gemma_2_27b_it,livebench_lb,41.22,[],livebench_240701
11
+ 9,gemini_1.5_flash_api_0514,livebench_lb,40.89,[],livebench_240701
12
+ 10,qwen2_72b_instruct,livebench_lb,40.16,[],livebench_240701
13
+ 11,acm_rewrite_qwen2_72b_chat,livebench_lb,39.6,[],livebench_240701
14
+ 12,mistral_large_2402,livebench_lb,38.92,[],livebench_240701
15
+ 13,deepseek_chat_v2,livebench_lb,38.39,[],livebench_240701
16
+ 14,claude_3_sonnet_20240229,livebench_lb,38.08,[],livebench_240701
17
+ 15,meta_llama_3_70b_instruct,livebench_lb,37.38,[],livebench_240701
18
+ 16,claude_3_haiku_20240307,livebench_lb,35.32,[],livebench_240701
19
+ 17,mixtral_8x22b_instruct_v0.1,livebench_lb,34.84,[],livebench_240701
20
+ 18,gpt_3.5_turbo_0125,livebench_lb,34.43,[],livebench_240701
21
+ 19,gpt_3.5_turbo_1106,livebench_lb,34.14,[],livebench_240701
22
+ 20,command_r_plus,livebench_lb,32.86,[],livebench_240701
23
+ 21,mistral_small_2402,livebench_lb,32.8,[],livebench_240701
24
+ 22,gemma_2_9b_it,livebench_lb,31.57,[],livebench_240701
25
+ 23,phi_3_medium_4k_instruct,livebench_lb,30.33,[],livebench_240701
26
+ 24,phi_3_medium_128k_instruct,livebench_lb,29.64,[],livebench_240701
27
+ 25,deepseek_coder_v2_lite_instruct,livebench_lb,29.15,[],livebench_240701
28
+ 26,qwen1.5_110b_chat,livebench_lb,28.96,[],livebench_240701
29
+ 27,qwen1.5_72b_chat,livebench_lb,28.89,[],livebench_240701
30
+ 28,command_r,livebench_lb,27.23,[],livebench_240701
31
+ 29,phi_3_small_128k_instruct,livebench_lb,27.19,[],livebench_240701
32
+ 30,meta_llama_3_8b_instruct,livebench_lb,26.67,[],livebench_240701
33
+ 31,qwen2_7b_instruct,livebench_lb,26.45,[],livebench_240701
34
+ 32,phi_3_small_8k_instruct,livebench_lb,26.24,[],livebench_240701
35
+ 33,openhermes_2.5_mistral_7b,livebench_lb,23.3,[],livebench_240701
36
+ 34,mixtral_8x7b_instruct_v0.1,livebench_lb,22.5,[],livebench_240701
37
+ 35,mistral_7b_instruct_v0.2,livebench_lb,19.33,[],livebench_240701
38
+ 36,phi_3_mini_4k_instruct,livebench_lb,19.27,[],livebench_240701
39
+ 37,zephyr_7b_alpha,livebench_lb,19.22,[],livebench_240701
40
+ 38,phi_3_mini_128k_instruct,livebench_lb,18.04,[],livebench_240701
41
+ 39,zephyr_7b_beta,livebench_lb,17.32,[],livebench_240701
42
+ 40,deepseek_v2_lite_chat,livebench_lb,17.14,[],livebench_240701
43
+ 41,qwen1.5_7b_chat,livebench_lb,16.5,[],livebench_240701
44
+ 42,starling_lm_7b_beta,livebench_lb,16.44,[],livebench_240701
45
+ 43,vicuna_7b_v1.5_16k,livebench_lb,13.71,[],livebench_240701
46
+ 44,vicuna_7b_v1.5,livebench_lb,11.73,[],livebench_240701
47
+ 45,qwen1.5_4b_chat,livebench_lb,11.13,[],livebench_240701
48
+ 46,llama_2_7b_chat,livebench_lb,10.25,[],livebench_240701
49
+ 47,qwen2_1.5b_instruct,livebench_lb,9.96,[],livebench_240701
50
+ 48,yi_6b_chat,livebench_lb,8.79,[],livebench_240701
51
+ 49,qwen2_0.5b_instruct,livebench_lb,6.78,[],livebench_240701
52
+ 50,qwen1.5_1.8b_chat,livebench_lb,6.09,[],livebench_240701
53
+ 51,qwen1.5_0.5b_chat,livebench_lb,5.26,[],livebench_240701
54
+ 52,claude_3_5_sonnet_20240620,reasoning_lb,64.0,[],livebench_240701
55
+ 53,gpt_4o_2024_05_13,reasoning_lb,55.0,[],livebench_240701
56
+ 54,gpt_4_turbo_2024_04_09,reasoning_lb,54.0,[],livebench_240701
57
+ 55,gpt_4_1106_preview,reasoning_lb,52.0,[],livebench_240701
58
+ 56,claude_3_opus_20240229,reasoning_lb,41.0,[],livebench_240701
59
+ 57,gpt_4_0125_preview,reasoning_lb,48.0,[],livebench_240701
60
+ 58,deepseek_coder_v2,reasoning_lb,49.0,[],livebench_240701
61
+ 59,gemini_1.5_pro_api_0514,reasoning_lb,33.0,[],livebench_240701
62
+ 60,gemma_2_27b_it,reasoning_lb,31.0,[],livebench_240701
63
+ 61,gemini_1.5_flash_api_0514,reasoning_lb,30.0,[],livebench_240701
64
+ 62,qwen2_72b_instruct,reasoning_lb,42.0,[],livebench_240701
65
+ 63,acm_rewrite_qwen2_72b_chat,reasoning_lb,37.0,[],livebench_240701
66
+ 64,mistral_large_2402,reasoning_lb,35.0,[],livebench_240701
67
+ 65,deepseek_chat_v2,reasoning_lb,29.0,[],livebench_240701
68
+ 66,claude_3_sonnet_20240229,reasoning_lb,26.0,[],livebench_240701
69
+ 67,meta_llama_3_70b_instruct,reasoning_lb,31.0,[],livebench_240701
70
+ 68,claude_3_haiku_20240307,reasoning_lb,26.0,[],livebench_240701
71
+ 69,mixtral_8x22b_instruct_v0.1,reasoning_lb,29.0,[],livebench_240701
72
+ 70,gpt_3.5_turbo_0125,reasoning_lb,26.0,[],livebench_240701
73
+ 71,gpt_3.5_turbo_1106,reasoning_lb,28.0,[],livebench_240701
74
+ 72,command_r_plus,reasoning_lb,32.0,[],livebench_240701
75
+ 73,mistral_small_2402,reasoning_lb,28.0,[],livebench_240701
76
+ 74,gemma_2_9b_it,reasoning_lb,19.0,[],livebench_240701
77
+ 75,phi_3_medium_4k_instruct,reasoning_lb,35.0,[],livebench_240701
78
+ 76,phi_3_medium_128k_instruct,reasoning_lb,31.0,[],livebench_240701
79
+ 77,deepseek_coder_v2_lite_instruct,reasoning_lb,22.0,[],livebench_240701
80
+ 78,qwen1.5_110b_chat,reasoning_lb,26.0,[],livebench_240701
81
+ 79,qwen1.5_72b_chat,reasoning_lb,21.0,[],livebench_240701
82
+ 80,command_r,reasoning_lb,28.0,[],livebench_240701
83
+ 81,phi_3_small_128k_instruct,reasoning_lb,36.0,[],livebench_240701
84
+ 82,meta_llama_3_8b_instruct,reasoning_lb,25.0,[],livebench_240701
85
+ 83,qwen2_7b_instruct,reasoning_lb,20.0,[],livebench_240701
86
+ 84,phi_3_small_8k_instruct,reasoning_lb,23.0,[],livebench_240701
87
+ 85,openhermes_2.5_mistral_7b,reasoning_lb,17.0,[],livebench_240701
88
+ 86,mixtral_8x7b_instruct_v0.1,reasoning_lb,18.0,[],livebench_240701
89
+ 87,mistral_7b_instruct_v0.2,reasoning_lb,13.0,[],livebench_240701
90
+ 88,phi_3_mini_4k_instruct,reasoning_lb,19.0,[],livebench_240701
91
+ 89,zephyr_7b_alpha,reasoning_lb,17.0,[],livebench_240701
92
+ 90,phi_3_mini_128k_instruct,reasoning_lb,10.0,[],livebench_240701
93
+ 91,zephyr_7b_beta,reasoning_lb,16.0,[],livebench_240701
94
+ 92,deepseek_v2_lite_chat,reasoning_lb,13.0,[],livebench_240701
95
+ 93,qwen1.5_7b_chat,reasoning_lb,13.0,[],livebench_240701
96
+ 94,starling_lm_7b_beta,reasoning_lb,19.0,[],livebench_240701
97
+ 95,vicuna_7b_v1.5_16k,reasoning_lb,15.0,[],livebench_240701
98
+ 96,vicuna_7b_v1.5,reasoning_lb,12.0,[],livebench_240701
99
+ 97,qwen1.5_4b_chat,reasoning_lb,13.0,[],livebench_240701
100
+ 98,llama_2_7b_chat,reasoning_lb,5.0,[],livebench_240701
101
+ 99,qwen2_1.5b_instruct,reasoning_lb,8.0,[],livebench_240701
102
+ 100,yi_6b_chat,reasoning_lb,8.0,[],livebench_240701
103
+ 101,qwen2_0.5b_instruct,reasoning_lb,3.0,[],livebench_240701
104
+ 102,qwen1.5_1.8b_chat,reasoning_lb,5.0,[],livebench_240701
105
+ 103,qwen1.5_0.5b_chat,reasoning_lb,4.0,[],livebench_240701
106
+ 104,claude_3_5_sonnet_20240620,coding_lb,63.21,[],livebench_240701
107
+ 105,gpt_4o_2024_05_13,coding_lb,46.37,[],livebench_240701
108
+ 106,gpt_4_turbo_2024_04_09,coding_lb,47.05,[],livebench_240701
109
+ 107,gpt_4_1106_preview,coding_lb,44.37,[],livebench_240701
110
+ 108,claude_3_opus_20240229,coding_lb,40.05,[],livebench_240701
111
+ 109,gpt_4_0125_preview,coding_lb,44.05,[],livebench_240701
112
+ 110,deepseek_coder_v2,coding_lb,41.05,[],livebench_240701
113
+ 111,gemini_1.5_pro_api_0514,coding_lb,32.79,[],livebench_240701
114
+ 112,gemma_2_27b_it,coding_lb,36.74,[],livebench_240701
115
+ 113,gemini_1.5_flash_api_0514,coding_lb,39.05,[],livebench_240701
116
+ 114,qwen2_72b_instruct,coding_lb,31.79,[],livebench_240701
117
+ 115,acm_rewrite_qwen2_72b_chat,coding_lb,39.05,[],livebench_240701
118
+ 116,mistral_large_2402,coding_lb,26.84,[],livebench_240701
119
+ 117,deepseek_chat_v2,coding_lb,33.47,[],livebench_240701
120
+ 118,claude_3_sonnet_20240229,coding_lb,25.21,[],livebench_240701
121
+ 119,meta_llama_3_70b_instruct,coding_lb,20.95,[],livebench_240701
122
+ 120,claude_3_haiku_20240307,coding_lb,24.53,[],livebench_240701
123
+ 121,mixtral_8x22b_instruct_v0.1,coding_lb,33.11,[],livebench_240701
124
+ 122,gpt_3.5_turbo_0125,coding_lb,29.16,[],livebench_240701
125
+ 123,gpt_3.5_turbo_1106,coding_lb,26.84,[],livebench_240701
126
+ 124,command_r_plus,coding_lb,20.26,[],livebench_240701
127
+ 125,mistral_small_2402,coding_lb,24.21,[],livebench_240701
128
+ 126,gemma_2_9b_it,coding_lb,22.21,[],livebench_240701
129
+ 127,phi_3_medium_4k_instruct,coding_lb,20.58,[],livebench_240701
130
+ 128,phi_3_medium_128k_instruct,coding_lb,21.58,[],livebench_240701
131
+ 129,deepseek_coder_v2_lite_instruct,coding_lb,26.84,[],livebench_240701
132
+ 130,qwen1.5_110b_chat,coding_lb,22.21,[],livebench_240701
133
+ 131,qwen1.5_72b_chat,coding_lb,22.89,[],livebench_240701
134
+ 132,command_r,coding_lb,14.95,[],livebench_240701
135
+ 133,phi_3_small_128k_instruct,coding_lb,25.84,[],livebench_240701
136
+ 134,meta_llama_3_8b_instruct,coding_lb,18.26,[],livebench_240701
137
+ 135,qwen2_7b_instruct,coding_lb,29.21,[],livebench_240701
138
+ 136,phi_3_small_8k_instruct,coding_lb,19.58,[],livebench_240701
139
+ 137,openhermes_2.5_mistral_7b,coding_lb,11.63,[],livebench_240701
140
+ 138,mixtral_8x7b_instruct_v0.1,coding_lb,11.32,[],livebench_240701
141
+ 139,mistral_7b_instruct_v0.2,coding_lb,11.63,[],livebench_240701
142
+ 140,phi_3_mini_4k_instruct,coding_lb,14.95,[],livebench_240701
143
+ 141,zephyr_7b_alpha,coding_lb,11.32,[],livebench_240701
144
+ 142,phi_3_mini_128k_instruct,coding_lb,11.63,[],livebench_240701
145
+ 143,zephyr_7b_beta,coding_lb,8.32,[],livebench_240701
146
+ 144,deepseek_v2_lite_chat,coding_lb,8.63,[],livebench_240701
147
+ 145,qwen1.5_7b_chat,coding_lb,6.63,[],livebench_240701
148
+ 146,starling_lm_7b_beta,coding_lb,18.26,[],livebench_240701
149
+ 147,vicuna_7b_v1.5_16k,coding_lb,1.32,[],livebench_240701
150
+ 148,vicuna_7b_v1.5,coding_lb,1.0,[],livebench_240701
151
+ 149,qwen1.5_4b_chat,coding_lb,4.0,[],livebench_240701
152
+ 150,llama_2_7b_chat,coding_lb,0.0,[],livebench_240701
153
+ 151,qwen2_1.5b_instruct,coding_lb,5.63,[],livebench_240701
154
+ 152,yi_6b_chat,coding_lb,1.32,[],livebench_240701
155
+ 153,qwen2_0.5b_instruct,coding_lb,2.0,[],livebench_240701
156
+ 154,qwen1.5_1.8b_chat,coding_lb,0.0,[],livebench_240701
157
+ 155,qwen1.5_0.5b_chat,coding_lb,0.0,[],livebench_240701
158
+ 156,claude_3_5_sonnet_20240620,mathematics_lb,53.75,[],livebench_240701
159
+ 157,gpt_4o_2024_05_13,mathematics_lb,49.88,[],livebench_240701
160
+ 158,gpt_4_turbo_2024_04_09,mathematics_lb,48.99,[],livebench_240701
161
+ 159,gpt_4_1106_preview,mathematics_lb,47.55,[],livebench_240701
162
+ 160,claude_3_opus_20240229,mathematics_lb,46.54,[],livebench_240701
163
+ 161,gpt_4_0125_preview,mathematics_lb,42.75,[],livebench_240701
164
+ 162,deepseek_coder_v2,mathematics_lb,52.19,[],livebench_240701
165
+ 163,gemini_1.5_pro_api_0514,mathematics_lb,42.07,[],livebench_240701
166
+ 164,gemma_2_27b_it,mathematics_lb,36.23,[],livebench_240701
167
+ 165,gemini_1.5_flash_api_0514,mathematics_lb,38.54,[],livebench_240701
168
+ 166,qwen2_72b_instruct,mathematics_lb,43.44,[],livebench_240701
169
+ 167,acm_rewrite_qwen2_72b_chat,mathematics_lb,40.32,[],livebench_240701
170
+ 168,mistral_large_2402,mathematics_lb,32.2,[],livebench_240701
171
+ 169,deepseek_chat_v2,mathematics_lb,33.23,[],livebench_240701
172
+ 170,claude_3_sonnet_20240229,mathematics_lb,29.65,[],livebench_240701
173
+ 171,meta_llama_3_70b_instruct,mathematics_lb,32.31,[],livebench_240701
174
+ 172,claude_3_haiku_20240307,mathematics_lb,25.72,[],livebench_240701
175
+ 173,mixtral_8x22b_instruct_v0.1,mathematics_lb,26.94,[],livebench_240701
176
+ 174,gpt_3.5_turbo_0125,mathematics_lb,25.54,[],livebench_240701
177
+ 175,gpt_3.5_turbo_1106,mathematics_lb,28.13,[],livebench_240701
178
+ 176,command_r_plus,mathematics_lb,24.85,[],livebench_240701
179
+ 177,mistral_small_2402,mathematics_lb,26.76,[],livebench_240701
180
+ 178,gemma_2_9b_it,mathematics_lb,23.98,[],livebench_240701
181
+ 179,phi_3_medium_4k_instruct,mathematics_lb,27.54,[],livebench_240701
182
+ 180,phi_3_medium_128k_instruct,mathematics_lb,24.25,[],livebench_240701
183
+ 181,deepseek_coder_v2_lite_instruct,mathematics_lb,34.09,[],livebench_240701
184
+ 182,qwen1.5_110b_chat,mathematics_lb,25.58,[],livebench_240701
185
+ 183,qwen1.5_72b_chat,mathematics_lb,26.82,[],livebench_240701
186
+ 184,command_r,mathematics_lb,16.92,[],livebench_240701
187
+ 185,phi_3_small_128k_instruct,mathematics_lb,24.84,[],livebench_240701
188
+ 186,meta_llama_3_8b_instruct,mathematics_lb,17.58,[],livebench_240701
189
+ 187,qwen2_7b_instruct,mathematics_lb,25.83,[],livebench_240701
190
+ 188,phi_3_small_8k_instruct,mathematics_lb,24.15,[],livebench_240701
191
+ 189,openhermes_2.5_mistral_7b,mathematics_lb,20.1,[],livebench_240701
192
+ 190,mixtral_8x7b_instruct_v0.1,mathematics_lb,18.97,[],livebench_240701
193
+ 191,mistral_7b_instruct_v0.2,mathematics_lb,16.04,[],livebench_240701
194
+ 192,phi_3_mini_4k_instruct,mathematics_lb,19.88,[],livebench_240701
195
+ 193,zephyr_7b_alpha,mathematics_lb,9.61,[],livebench_240701
196
+ 194,phi_3_mini_128k_instruct,mathematics_lb,21.48,[],livebench_240701
197
+ 195,zephyr_7b_beta,mathematics_lb,11.23,[],livebench_240701
198
+ 196,deepseek_v2_lite_chat,mathematics_lb,11.99,[],livebench_240701
199
+ 197,qwen1.5_7b_chat,mathematics_lb,12.86,[],livebench_240701
200
+ 198,starling_lm_7b_beta,mathematics_lb,13.82,[],livebench_240701
201
+ 199,vicuna_7b_v1.5_16k,mathematics_lb,6.61,[],livebench_240701
202
+ 200,vicuna_7b_v1.5,mathematics_lb,4.33,[],livebench_240701
203
+ 201,qwen1.5_4b_chat,mathematics_lb,7.08,[],livebench_240701
204
+ 202,llama_2_7b_chat,mathematics_lb,4.78,[],livebench_240701
205
+ 203,qwen2_1.5b_instruct,mathematics_lb,7.16,[],livebench_240701
206
+ 204,yi_6b_chat,mathematics_lb,7.14,[],livebench_240701
207
+ 205,qwen2_0.5b_instruct,mathematics_lb,4.22,[],livebench_240701
208
+ 206,qwen1.5_1.8b_chat,mathematics_lb,2.14,[],livebench_240701
209
+ 207,qwen1.5_0.5b_chat,mathematics_lb,3.39,[],livebench_240701
210
+ 208,claude_3_5_sonnet_20240620,data_analysis_lb,56.74,[],livebench_240701
211
+ 209,gpt_4o_2024_05_13,data_analysis_lb,52.41,[],livebench_240701
212
+ 210,gpt_4_turbo_2024_04_09,data_analysis_lb,51.32,[],livebench_240701
213
+ 211,gpt_4_1106_preview,data_analysis_lb,51.33,[],livebench_240701
214
+ 212,claude_3_opus_20240229,data_analysis_lb,54.32,[],livebench_240701
215
+ 213,gpt_4_0125_preview,data_analysis_lb,54.06,[],livebench_240701
216
+ 214,deepseek_coder_v2,data_analysis_lb,38.25,[],livebench_240701
217
+ 215,gemini_1.5_pro_api_0514,data_analysis_lb,52.81,[],livebench_240701
218
+ 216,gemma_2_27b_it,data_analysis_lb,43.58,[],livebench_240701
219
+ 217,gemini_1.5_flash_api_0514,data_analysis_lb,44.03,[],livebench_240701
220
+ 218,qwen2_72b_instruct,data_analysis_lb,26.24,[],livebench_240701
221
+ 219,acm_rewrite_qwen2_72b_chat,data_analysis_lb,26.19,[],livebench_240701
222
+ 220,mistral_large_2402,data_analysis_lb,42.55,[],livebench_240701
223
+ 221,deepseek_chat_v2,data_analysis_lb,38.03,[],livebench_240701
224
+ 222,claude_3_sonnet_20240229,data_analysis_lb,44.56,[],livebench_240701
225
+ 223,meta_llama_3_70b_instruct,data_analysis_lb,42.41,[],livebench_240701
226
+ 224,claude_3_haiku_20240307,data_analysis_lb,41.54,[],livebench_240701
227
+ 225,mixtral_8x22b_instruct_v0.1,data_analysis_lb,30.33,[],livebench_240701
228
+ 226,gpt_3.5_turbo_0125,data_analysis_lb,41.21,[],livebench_240701
229
+ 227,gpt_3.5_turbo_1106,data_analysis_lb,41.7,[],livebench_240701
230
+ 228,command_r_plus,data_analysis_lb,24.6,[],livebench_240701
231
+ 229,mistral_small_2402,data_analysis_lb,31.88,[],livebench_240701
232
+ 230,gemma_2_9b_it,data_analysis_lb,35.06,[],livebench_240701
233
+ 231,phi_3_medium_4k_instruct,data_analysis_lb,31.63,[],livebench_240701
234
+ 232,phi_3_medium_128k_instruct,data_analysis_lb,32.12,[],livebench_240701
235
+ 233,deepseek_coder_v2_lite_instruct,data_analysis_lb,33.0,[],livebench_240701
236
+ 234,qwen1.5_110b_chat,data_analysis_lb,31.45,[],livebench_240701
237
+ 235,qwen1.5_72b_chat,data_analysis_lb,32.98,[],livebench_240701
238
+ 236,command_r,data_analysis_lb,31.69,[],livebench_240701
239
+ 237,phi_3_small_128k_instruct,data_analysis_lb,27.33,[],livebench_240701
240
+ 238,meta_llama_3_8b_instruct,data_analysis_lb,23.33,[],livebench_240701
241
+ 239,qwen2_7b_instruct,data_analysis_lb,28.75,[],livebench_240701
242
+ 240,phi_3_small_8k_instruct,data_analysis_lb,27.5,[],livebench_240701
243
+ 241,openhermes_2.5_mistral_7b,data_analysis_lb,26.92,[],livebench_240701
244
+ 242,mixtral_8x7b_instruct_v0.1,data_analysis_lb,28.13,[],livebench_240701
245
+ 243,mistral_7b_instruct_v0.2,data_analysis_lb,14.62,[],livebench_240701
246
+ 244,phi_3_mini_4k_instruct,data_analysis_lb,14.67,[],livebench_240701
247
+ 245,zephyr_7b_alpha,data_analysis_lb,17.4,[],livebench_240701
248
+ 246,phi_3_mini_128k_instruct,data_analysis_lb,8.69,[],livebench_240701
249
+ 247,zephyr_7b_beta,data_analysis_lb,15.75,[],livebench_240701
250
+ 248,deepseek_v2_lite_chat,data_analysis_lb,18.19,[],livebench_240701
251
+ 249,qwen1.5_7b_chat,data_analysis_lb,16.23,[],livebench_240701
252
+ 250,starling_lm_7b_beta,data_analysis_lb,2.0,[],livebench_240701
253
+ 251,vicuna_7b_v1.5_16k,data_analysis_lb,9.27,[],livebench_240701
254
+ 252,vicuna_7b_v1.5,data_analysis_lb,2.67,[],livebench_240701
255
+ 253,qwen1.5_4b_chat,data_analysis_lb,9.13,[],livebench_240701
256
+ 254,llama_2_7b_chat,data_analysis_lb,0.0,[],livebench_240701
257
+ 255,qwen2_1.5b_instruct,data_analysis_lb,10.01,[],livebench_240701
258
+ 256,yi_6b_chat,data_analysis_lb,4.38,[],livebench_240701
259
+ 257,qwen2_0.5b_instruct,data_analysis_lb,2.0,[],livebench_240701
260
+ 258,qwen1.5_1.8b_chat,data_analysis_lb,3.33,[],livebench_240701
261
+ 259,qwen1.5_0.5b_chat,data_analysis_lb,0.0,[],livebench_240701
262
+ 260,claude_3_5_sonnet_20240620,language_lb,56.94,[],livebench_240701
263
+ 261,gpt_4o_2024_05_13,language_lb,53.94,[],livebench_240701
264
+ 262,gpt_4_turbo_2024_04_09,language_lb,45.26,[],livebench_240701
265
+ 263,gpt_4_1106_preview,language_lb,48.37,[],livebench_240701
266
+ 264,claude_3_opus_20240229,language_lb,51.72,[],livebench_240701
267
+ 265,gpt_4_0125_preview,language_lb,43.55,[],livebench_240701
268
+ 266,deepseek_coder_v2,language_lb,33.04,[],livebench_240701
269
+ 267,gemini_1.5_pro_api_0514,language_lb,38.25,[],livebench_240701
270
+ 268,gemma_2_27b_it,language_lb,32.4,[],livebench_240701
271
+ 269,gemini_1.5_flash_api_0514,language_lb,30.69,[],livebench_240701
272
+ 270,qwen2_72b_instruct,language_lb,29.21,[],livebench_240701
273
+ 271,acm_rewrite_qwen2_72b_chat,language_lb,30.03,[],livebench_240701
274
+ 272,mistral_large_2402,language_lb,28.74,[],livebench_240701
275
+ 273,deepseek_chat_v2,language_lb,32.29,[],livebench_240701
276
+ 274,claude_3_sonnet_20240229,language_lb,38.08,[],livebench_240701
277
+ 275,meta_llama_3_70b_instruct,language_lb,34.11,[],livebench_240701
278
+ 276,claude_3_haiku_20240307,language_lb,30.07,[],livebench_240701
279
+ 277,mixtral_8x22b_instruct_v0.1,language_lb,26.48,[],livebench_240701
280
+ 278,gpt_3.5_turbo_0125,language_lb,24.22,[],livebench_240701
281
+ 279,gpt_3.5_turbo_1106,language_lb,28.63,[],livebench_240701
282
+ 280,command_r_plus,language_lb,23.92,[],livebench_240701
283
+ 281,mistral_small_2402,language_lb,22.06,[],livebench_240701
284
+ 282,gemma_2_9b_it,language_lb,27.64,[],livebench_240701
285
+ 283,phi_3_medium_4k_instruct,language_lb,13.91,[],livebench_240701
286
+ 284,phi_3_medium_128k_instruct,language_lb,12.76,[],livebench_240701
287
+ 285,deepseek_coder_v2_lite_instruct,language_lb,10.64,[],livebench_240701
288
+ 286,qwen1.5_110b_chat,language_lb,13.22,[],livebench_240701
289
+ 287,qwen1.5_72b_chat,language_lb,11.37,[],livebench_240701
290
+ 288,command_r,language_lb,14.64,[],livebench_240701
291
+ 289,phi_3_small_128k_instruct,language_lb,12.28,[],livebench_240701
292
+ 290,meta_llama_3_8b_instruct,language_lb,18.72,[],livebench_240701
293
+ 291,qwen2_7b_instruct,language_lb,10.21,[],livebench_240701
294
+ 292,phi_3_small_8k_instruct,language_lb,14.96,[],livebench_240701
295
+ 293,openhermes_2.5_mistral_7b,language_lb,11.37,[],livebench_240701
296
+ 294,mixtral_8x7b_instruct_v0.1,language_lb,13.76,[],livebench_240701
297
+ 295,mistral_7b_instruct_v0.2,language_lb,9.05,[],livebench_240701
298
+ 296,phi_3_mini_4k_instruct,language_lb,7.1,[],livebench_240701
299
+ 297,zephyr_7b_alpha,language_lb,7.2,[],livebench_240701
300
+ 298,phi_3_mini_128k_instruct,language_lb,6.8,[],livebench_240701
301
+ 299,zephyr_7b_beta,language_lb,4.28,[],livebench_240701
302
+ 300,deepseek_v2_lite_chat,language_lb,9.2,[],livebench_240701
303
+ 301,qwen1.5_7b_chat,language_lb,6.18,[],livebench_240701
304
+ 302,starling_lm_7b_beta,language_lb,7.26,[],livebench_240701
305
+ 303,vicuna_7b_v1.5_16k,language_lb,7.92,[],livebench_240701
306
+ 304,vicuna_7b_v1.5,language_lb,8.66,[],livebench_240701
307
+ 305,qwen1.5_4b_chat,language_lb,5.8,[],livebench_240701
308
+ 306,llama_2_7b_chat,language_lb,6.86,[],livebench_240701
309
+ 307,qwen2_1.5b_instruct,language_lb,3.05,[],livebench_240701
310
+ 308,yi_6b_chat,language_lb,4.69,[],livebench_240701
311
+ 309,qwen2_0.5b_instruct,language_lb,2.8,[],livebench_240701
312
+ 310,qwen1.5_1.8b_chat,language_lb,3.16,[],livebench_240701
313
+ 311,qwen1.5_0.5b_chat,language_lb,2.88,[],livebench_240701
314
+ 312,claude_3_5_sonnet_20240620,if_lb,72.3,[],livebench_240701
315
+ 313,gpt_4o_2024_05_13,if_lb,72.17,[],livebench_240701
316
+ 314,gpt_4_turbo_2024_04_09,if_lb,71.39,[],livebench_240701
317
+ 315,gpt_4_1106_preview,if_lb,69.39,[],livebench_240701
318
+ 316,claude_3_opus_20240229,if_lb,70.87,[],livebench_240701
319
+ 317,gpt_4_0125_preview,if_lb,63.92,[],livebench_240701
320
+ 318,deepseek_coder_v2,if_lb,67.18,[],livebench_240701
321
+ 319,gemini_1.5_pro_api_0514,if_lb,67.2,[],livebench_240701
322
+ 320,gemma_2_27b_it,if_lb,67.37,[],livebench_240701
323
+ 321,gemini_1.5_flash_api_0514,if_lb,63.01,[],livebench_240701
324
+ 322,qwen2_72b_instruct,if_lb,68.27,[],livebench_240701
325
+ 323,acm_rewrite_qwen2_72b_chat,if_lb,65.0,[],livebench_240701
326
+ 324,mistral_large_2402,if_lb,68.19,[],livebench_240701
327
+ 325,deepseek_chat_v2,if_lb,64.34,[],livebench_240701
328
+ 326,claude_3_sonnet_20240229,if_lb,65.0,[],livebench_240701
329
+ 327,meta_llama_3_70b_instruct,if_lb,63.5,[],livebench_240701
330
+ 328,claude_3_haiku_20240307,if_lb,64.03,[],livebench_240701
331
+ 329,mixtral_8x22b_instruct_v0.1,if_lb,63.17,[],livebench_240701
332
+ 330,gpt_3.5_turbo_0125,if_lb,60.47,[],livebench_240701
333
+ 331,gpt_3.5_turbo_1106,if_lb,51.53,[],livebench_240701
334
+ 332,command_r_plus,if_lb,71.51,[],livebench_240701
335
+ 333,mistral_small_2402,if_lb,63.91,[],livebench_240701
336
+ 334,gemma_2_9b_it,if_lb,61.55,[],livebench_240701
337
+ 335,phi_3_medium_4k_instruct,if_lb,53.3,[],livebench_240701
338
+ 336,phi_3_medium_128k_instruct,if_lb,56.15,[],livebench_240701
339
+ 337,deepseek_coder_v2_lite_instruct,if_lb,48.34,[],livebench_240701
340
+ 338,qwen1.5_110b_chat,if_lb,55.26,[],livebench_240701
341
+ 339,qwen1.5_72b_chat,if_lb,58.25,[],livebench_240701
342
+ 340,command_r,if_lb,57.16,[],livebench_240701
343
+ 341,phi_3_small_128k_instruct,if_lb,36.88,[],livebench_240701
344
+ 342,meta_llama_3_8b_instruct,if_lb,57.14,[],livebench_240701
345
+ 343,qwen2_7b_instruct,if_lb,44.74,[],livebench_240701
346
+ 344,phi_3_small_8k_instruct,if_lb,48.24,[],livebench_240701
347
+ 345,openhermes_2.5_mistral_7b,if_lb,52.78,[],livebench_240701
348
+ 346,mixtral_8x7b_instruct_v0.1,if_lb,44.81,[],livebench_240701
349
+ 347,mistral_7b_instruct_v0.2,if_lb,51.65,[],livebench_240701
350
+ 348,phi_3_mini_4k_instruct,if_lb,40.05,[],livebench_240701
351
+ 349,zephyr_7b_alpha,if_lb,52.79,[],livebench_240701
352
+ 350,phi_3_mini_128k_instruct,if_lb,49.65,[],livebench_240701
353
+ 351,zephyr_7b_beta,if_lb,48.32,[],livebench_240701
354
+ 352,deepseek_v2_lite_chat,if_lb,41.83,[],livebench_240701
355
+ 353,qwen1.5_7b_chat,if_lb,44.12,[],livebench_240701
356
+ 354,starling_lm_7b_beta,if_lb,38.32,[],livebench_240701
357
+ 355,vicuna_7b_v1.5_16k,if_lb,42.12,[],livebench_240701
358
+ 356,vicuna_7b_v1.5,if_lb,41.75,[],livebench_240701
359
+ 357,qwen1.5_4b_chat,if_lb,27.75,[],livebench_240701
360
+ 358,llama_2_7b_chat,if_lb,44.88,[],livebench_240701
361
+ 359,qwen2_1.5b_instruct,if_lb,25.9,[],livebench_240701
362
+ 360,yi_6b_chat,if_lb,27.22,[],livebench_240701
363
+ 361,qwen2_0.5b_instruct,if_lb,26.63,[],livebench_240701
364
+ 362,qwen1.5_1.8b_chat,if_lb,22.9,[],livebench_240701
365
+ 363,qwen1.5_0.5b_chat,if_lb,21.3,[],livebench_240701