Yotam Perlitz commited on
Commit
05398d1
1 Parent(s): 0f8e886

git commit finalize app

Browse files

Signed-off-by: Yotam Perlitz <yotam.perlitz@ibm.com>

.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ .vscode/launch.json
2
+ .vscode/settings.json
3
+ .DS_Store
app.py CHANGED
@@ -1,117 +1,413 @@
1
- import streamlit as st
 
 
2
  import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
- st.title("‎‎‎ ‎‎ ‎ ‎ ‎ ‎ ‎ ‎🏋️‍♂️ benchbench-Leaderboard 🏋️‍♂️")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  import pandas as pd
7
  from bat import Tester, Config, Benchmark, Reporter
8
  from bat.utils import get_holistic_benchmark
9
 
10
-
11
  cfg = Config(
12
  exp_to_run="example",
13
  n_models_taken_list=[0],
14
  model_select_strategy_list=["random"],
15
- n_exps=10,
16
- # reference_data_path="data/combined_holistic.csv",
17
  )
 
18
 
 
19
 
20
- newbench_name = "livebench"
21
- new_bench_agg_name = f"{newbench_name}_mwr"
22
 
 
23
  tester = Tester(cfg=cfg)
 
 
 
 
 
 
 
24
 
25
- # models_for_benchmark_scoring = tester.fetch_reference_models_names(
26
- # reference_benchmark=get_holistic_benchmark(), n_models=20
27
- # )
28
 
 
 
29
  newbench = Benchmark(
30
- pd.read_csv(f"assets/{newbench_name}.csv"),
31
  data_source=newbench_name,
32
  )
 
 
33
 
34
- # newbench.add_aggragete(new_col_name=new_bench_agg_name)
35
- # newbench_agreements = tester.all_vs_all_agreement_testing(newbench)
36
 
 
 
 
 
37
  reporter = Reporter()
38
- # reporter.draw_agreements(
39
- # newbench_agreements, ref_sources=[newbench_name], scenario_sources=[newbench_name]
40
- # )
 
41
 
42
- holistic = get_holistic_benchmark()
43
- holistic.add_aggragete(new_col_name="aggregate", agg_source_name="holistic")
44
 
45
- allbench = newbench.extend(holistic)
 
46
  allbench.clear_repeated_scenarios(source_to_keep=newbench_name)
 
 
 
47
 
 
48
 
49
- @st.cache_data
50
- def run_load():
51
- return tester.all_vs_all_agreement_testing(allbench)
52
-
53
-
54
- all_agreements = run_load()
55
-
56
- observed_scenario = "arena_elo" # "livebench_lb"
57
- blacklist_sources = [] # "livebench"
58
-
59
- z_score = reporter.get_z_score(all_agreements, observed_scenario, blacklist_sources)
60
-
61
- st.write(f"zscore of {observed_scenario}: {z_score}")
62
-
63
- # df = pd.read_csv("BAT_w_arena_10_random.csv")
64
- # df = (
65
- # (
66
- # df.rename(
67
- # columns={
68
- # "z_score": "Z_Score",
69
- # "benchmark": "Benchmark",
70
- # }
71
- # ).drop(
72
- # columns=[
73
- # "Unnamed: 0",
74
- # "z_test_pass",
75
- # ]
76
- # )
77
- # )
78
- # .sort_values("Z_Score", ascending=False)
79
- # .query(
80
- # 'Benchmark!="Aggregate" and Benchmark!="MAGI" and Benchmark!="Alpaca(v2, len adj)" and Benchmark!="GPT4All"'
81
- # )
82
- # )
83
-
84
-
85
- # df.replace(
86
- # {
87
- # "Arena Elo": "LMSys Arena",
88
- # "Hugging-6": "HF OpenLLM",
89
- # "Alpaca(v2)": "Alpaca v2",
90
- # "Alpaca(v1)": "Alpaca v1",
91
- # "EQ-Bench(v2)": "EQ-Bench v2",
92
- # },
93
- # inplace=True,
94
- # )
95
-
96
- # col1, col2, col3 = st.columns(3)
97
-
98
- # with col1:
99
- # st.header("‎ ‎ ‎ ‎ ‎ ‎ ‎ ‎ Agree")
100
- # st.dataframe(df.query("Z_Score>=0"), hide_index=True)
101
-
102
- # with col2:
103
- # st.header("‎ ‎‎ ‎ Disagree")
104
- # st.dataframe(df.query("Z_Score<0").sort_values("Z_Score"), hide_index=True)
105
-
106
- # with col3:
107
- # st.header("‎ ‎‎ ‎ Configs")
108
- # # st.selectbox(label="Reference Benchmarks", options=["LMSys Arena"])
109
- # options = st.multiselect(
110
- # "Reference Benchmarks",
111
- # ["LMSys Arena", "Open Compass", "Yellow", "Red", "Blue"],
112
- # ["LMSys Arena", "Open Compass"],
113
- # )
114
- # st.selectbox(label="# models compared", options=[20])
115
- # st.selectbox(label="Model Select Strategy", options=["Random"])
116
- # st.write("‎‎‎‎‎‎‎")
117
- # st.button("Upload a new benchmark")
 
1
+ import hashlib
2
+ import os
3
+
4
  import pandas as pd
5
+ import plotly.express as px
6
+ import streamlit as st
7
+ from bat import Benchmark, Config, Reporter, Tester
8
+ from bat.utils import get_holistic_benchmark
9
+
10
+ benchmarks_dict = {
11
+ "arena_elo": "LMSys Arena",
12
+ "mt_bench": "MT Bench",
13
+ "mixeval": "Mix Eval",
14
+ "alpacav2": "AlpacaEval V2",
15
+ "arena_hard": "Arena Hard",
16
+ "arc_c": "ARC-C",
17
+ "eq_benchv2": "EQ Bench V2",
18
+ "agieval": "AGIEval",
19
+ "llmonitor": "LLMonitor",
20
+ "bbh": "BBH",
21
+ "mmlu": "MMLU",
22
+ "alpacav1": "AlpacaEval V1",
23
+ "magi": "MAGI",
24
+ "alpacaeval2_lc": "AlpacaEval V2 Length Adjusted",
25
+ "gpt4all": "GPT-4-All",
26
+ "humaneval": "HumanEval",
27
+ "mbpp": "MBPP",
28
+ "hellaswag": "HellaSwag",
29
+ "hugging_6": "HF OpenLLM V1",
30
+ "winogrande": "Winogrande",
31
+ }
32
+
33
+ st.markdown(
34
+ """<h1 style='text-align: center; color: black;'>🏋️‍♂️ BenchBench Leaderboard 🏋️‍♂️</h1>""",
35
+ unsafe_allow_html=True,
36
+ )
37
+
38
+ st.markdown(
39
+ "We are excited to share the BenchBench-Leaderboard, a crucial component of our comprehensive research on Benchmark Agreement Testing (BAT) [work](#). "
40
+ "This leaderboard is a meta-benchmark that ranks benchmarks based on their agreement with the crowd harnessing many different references. "
41
+ )
42
+
43
+
44
+ all_scenarios_for_aggragate = get_holistic_benchmark().get_scenarios()
45
+
46
+ st.subheader("The Leaderboard", divider=True)
47
+ # st.subheader("🏋️‍♂️ BenchBench Leaderboard 🏋", divider=True)
48
+
49
+ leftcol, rightcol = st.columns([2, 1])
50
+ with leftcol:
51
+ with st.expander("Leaderboard configurations (defaults are great BTW)", icon="⚙️"):
52
+ with st.form("my_form"):
53
+ all_scenarios_for_aggragate_with_all = all_scenarios_for_aggragate.tolist()
54
+ all_scenarios_for_aggragate_with_all.append("All Holistic")
55
+
56
+ aggragate_scenarios = st.multiselect(
57
+ "Scenarios in Aggregate",
58
+ all_scenarios_for_aggragate_with_all,
59
+ ["All Holistic"],
60
+ # all_scenarios_for_aggragate,
61
+ )
62
+
63
+ corr_type = st.selectbox(
64
+ label="Select Correlation type", options=["kendall", "pearson"], index=0
65
+ )
66
+
67
+ aggragate_scenario_blacklist = (
68
+ [
69
+ scen
70
+ for scen in all_scenarios_for_aggragate
71
+ if scen not in aggragate_scenarios
72
+ ]
73
+ if "All Holistic" not in aggragate_scenarios
74
+ else []
75
+ )
76
+
77
+ model_select_strategy = st.selectbox(
78
+ label="Select strategy",
79
+ options=["random", "top_aggregate", "somewhere_aggregate"],
80
+ index=0,
81
+ )
82
+
83
+ n_models_taken_list = [5]
84
+ n_exps = 10
85
+
86
+ submitted = st.form_submit_button(label="Run BAT")
87
+
88
+ with rightcol:
89
+ st.button("➕ Add your benchmark here!")
90
+
91
+
92
def run_load(
    aggragate_scenario_blacklist=None,
    n_models_taken_list=None,
    model_select_strategy_list=None,
    corr_types=None,
    n_exps=10,
):
    """Return the all-vs-all agreement table, using an on-disk CSV cache.

    The arguments are stringified and hashed; the digest names a CSV file
    under ``cache/``. If that file exists it is loaded and returned,
    otherwise the agreements are computed with ``bat``, written to the
    cache file and returned.

    Args:
        aggragate_scenario_blacklist: Scenarios passed as
            ``scenario_blacklist`` when the "aggregate" scenario is added to
            the holistic benchmark. ``None`` means ``[]``.
        n_models_taken_list: Forwarded to ``Config``. ``None`` means ``[5]``.
        model_select_strategy_list: Forwarded to ``Config``. ``None`` means
            ``["random"]``.
        corr_types: Forwarded to ``Config``. ``None`` means ``["kendall"]``.
        n_exps: Forwarded to ``Config``; replaced by 1 when
            ``n_models_taken_list == [0]``.

    Returns:
        The agreements table: either the DataFrame read back from the cache
        file or the object returned by
        ``Tester.all_vs_all_agreement_testing``.
    """
    # ``None`` sentinels replace the former mutable list defaults. They are
    # normalised to the very same values, so the cache key below is unchanged
    # and cache files written by earlier versions are still found.
    if aggragate_scenario_blacklist is None:
        aggragate_scenario_blacklist = []
    if n_models_taken_list is None:
        n_models_taken_list = [5]
    if model_select_strategy_list is None:
        model_select_strategy_list = ["random"]
    if corr_types is None:
        corr_types = ["kendall"]

    # Hash the inputs so that every distinct configuration gets its own file.
    input_str = (
        str(aggragate_scenario_blacklist)
        + str(n_models_taken_list)
        + str(model_select_strategy_list)
        + str(corr_types)
        + str(n_exps)
    )
    input_hash = hashlib.md5(input_str.encode()).hexdigest()

    cache_dir = "cache"
    cache_path = os.path.join(cache_dir, f"agreements_cache_{input_hash}.csv")

    if os.path.exists(cache_path):
        print("Loading cached results...")
        return pd.read_csv(cache_path)

    print("Cached results not found, calculating")

    cfg = Config(
        exp_to_run="example",
        n_models_taken_list=n_models_taken_list,
        model_select_strategy_list=model_select_strategy_list,
        corr_types=corr_types,
        n_exps=n_exps if n_models_taken_list != [0] else 1,
    )

    # Holistic benchmark plus an "aggregate" scenario built from it.
    holistic = get_holistic_benchmark()
    holistic_scenarios = holistic.get_scenarios()
    holistic.clear_repeated_scenarios()
    holistic.add_aggragete(
        new_col_name="aggregate",
        agg_source_name="holistic",
        scenario_blacklist=aggragate_scenario_blacklist,
        min_scenario_for_models_to_appear_in_agg=5,
    )

    # Every other benchmark comes from the combined CSV shipped with the app.
    allbench = Benchmark(pd.read_csv("assets/combined_20240704.csv"))
    allbench.df = allbench.df.drop(columns=["tag"])
    allbench.clear_repeated_scenarios()
    # Scenarios already provided by the holistic benchmark are taken from
    # there; "_mixed" and "agentbench" scenarios are excluded altogether.
    allbench.df = allbench.df.query("scenario not in @holistic_scenarios")
    allbench.df = allbench.df[~allbench.df["scenario"].str.contains("_mixed")]
    allbench.df = allbench.df[~allbench.df["scenario"].str.contains("agentbench")]

    allbench = allbench.extend(holistic)

    tester = Tester(cfg=cfg)
    agreements = tester.all_vs_all_agreement_testing(
        allbench, single_source_scenario="aggregate"
    )

    # The cache directory is not guaranteed to exist (e.g. fresh checkout);
    # create it so the computed result is not lost to a failed write.
    os.makedirs(cache_dir, exist_ok=True)
    agreements.to_csv(cache_path, index=False)

    return agreements
168
+
169
+
170
+ agreements = run_load(
171
+ aggragate_scenario_blacklist=aggragate_scenario_blacklist,
172
+ n_models_taken_list=n_models_taken_list,
173
+ model_select_strategy_list=[model_select_strategy],
174
+ corr_types=[corr_type],
175
+ n_exps=n_exps,
176
+ )
177
+
178
# Build the leaderboard table from the agreements computed above.
reporter = Reporter()
z_scores = reporter.get_all_z_scores(agreements=agreements, aggragate_name="aggregate")

# Header of the correlation column reflects the selected correlation type.
corr_name = f"{'KT' if corr_type == 'kendall' else 'Per.'} Corr."

numeric_cols = ["z_score", "corr_with_agg", "p_value_of_corr_with_agg"]
z_scores[numeric_cols] = z_scores[numeric_cols].round(2)

data = (
    z_scores.rename(
        columns={
            "scenario": "Benchmark",
            "z_score": "Z Score",
            "corr_with_agg": corr_name,
            "p_value_of_corr_with_agg": "p value of Corr.",
            "source": "Source",
        }
    )
    .sort_values("Z Score", ascending=False)
    .reset_index(drop=True)
)

# Rows whose source mentions livebench or biggen are not shown. Work on an
# explicit copy so the column assignment below does not act on a view of the
# filtered frame.
hidden_rows = data["Source"].str.contains("livebench|biggen")
data = data[~hidden_rows].drop(columns=["Source"]).copy()

# Map scenario ids to display names; an id without a display name is shown
# as-is instead of crashing the whole page with a KeyError.
data["Benchmark"] = data["Benchmark"].map(
    lambda name: benchmarks_dict.get(name, name)
)

# Colour the Z scores on a scale symmetric around zero.
z_abs_max = data["Z Score"].abs().max()
styled_data = data.style.background_gradient(
    subset=["Z Score"],
    cmap="RdYlGn",
    vmin=-z_abs_max,
    vmax=z_abs_max,
).format(
    subset=["Z Score", corr_name, "p value of Corr."],
    # "{:.2f}" is two decimals; the previous "{:.2}" meant two significant
    # digits and rendered e.g. 1.23 as "1.2" and 12.3 as "1.2e+01".
    formatter="{:.2f}",
)

st.dataframe(
    data=styled_data,
    hide_index=True,
    use_container_width=True,
    height=300,
)
222
+
223
+ st.markdown(
224
+ "BenchBench-Leaderboard complements our study, where we analyzed over 40 prominent benchmarks and introduced standardized practices to enhance the robustness and validity of benchmark evaluations through the [BenchBench Python package](#). "
225
+ "The BenchBench-Leaderboard serves as a dynamic platform for benchmark comparison and is an essential tool for researchers and practitioners in the language model field aiming to select and utilize benchmarks effectively. "
226
+ )
227
+
228
+ st.subheader("How did we get the Z Scores?", divider=True)
229
+
230
+ st.write(r"""
231
+ Section 3.1 in our work shows how using a single reference benchmark drastically hurts the robustness and validity of BAT.
232
+ To remedy this, we propose to test benchmark agreement with an aggregate benchmark and compare the agreement to other benchmarks.
233
+ We recommend performing this comparison using the [Z score](https://en.wikipedia.org/wiki/Standard_score), and below we demonstrate how it is obtained for a benchmark of your choice.
234
+ The Z score is computed in the following way: $z_i=(x_i-\mu_{i...N}) / \sigma_{i...N}$ where $x_i$ is the agreement of the $i$th benchmark to the aggregate and $\mu_{i...N}$, $\sigma_{i...N}$ are the
235
+ mean and standard deviation of the agreements of the other benchmarks to the aggregate.
236
+ """)
237
+
238
+
239
# Let the user pick one benchmark and show where its correlation with the
# aggregate falls relative to the distribution of all the other benchmarks.
benchmarks = data["Benchmark"].unique().tolist()

# Pre-select "LMSys Arena" when it is in the table; otherwise fall back to
# the first entry rather than raising ValueError from list.index().
default_benchmark = "LMSys Arena"
default_index = (
    benchmarks.index(default_benchmark) if default_benchmark in benchmarks else 0
)
plotted_scenario = st.selectbox(
    "Choose Benchmark to plot", benchmarks, index=default_index
)

# Histogram over every benchmark except the selected one.
fig = px.histogram(
    data.query("Benchmark!=@plotted_scenario"),
    x=corr_name,
    nbins=max(len(data) - 1, 1),  # never ask for fewer than one bin
)

# Dashed red line marks the selected benchmark's own correlation value.
fig.add_vline(
    x=data.query("Benchmark==@plotted_scenario")[corr_name].iloc[0],
    line_dash="dash",
    line_color="red",
)

fig.update_layout(
    title="Histogram of Correlation Values",
    title_x=0.3,  # horizontal position of the title, as a fraction of the width
    title_font=dict(size=20, family="CMU"),
)

st.plotly_chart(fig, use_container_width=True)
265
+
266
+ st.subheader("Why should you use the BenchBench Leaderboard?")
267
+
268
+ st.markdown(
269
+ """
270
+
271
+ Current practices in Benchmark Agreement Testing (BAT) often suffer from a lack of standardization and transparency, which can lead to inconsistent results and diminished trust in benchmark evaluations. Several key issues are prevalent in the field:
272
+
273
+ """
274
+ )
275
+
276
+ st.markdown(
277
+ """
278
+ - **Lack of Standard Methodologies:** Unlike other scientific procedures that follow rigorous methodologies, BAT lacks uniform procedures across different studies. Researchers often employ varied criteria for selecting benchmarks and models for comparison, which leads to results that cannot be easily compared or replicated. This variation undermines the reliability of conclusions drawn from BAT and makes it difficult for other researchers to build on existing work.
279
+ """
280
+ )
281
+
282
+ st.image(
283
+ "assets/motivation.png",
284
+ caption="Conclusions depend on the models considered. Kendall-tau correlations between the LMSys Arena benchmark and three other benchmarks: BBH, MMLU, and Alpaca v2. Each group of bars represents the correlation for different sets of top models, specifically the top 5, top 10, and top 15 (overlapping) models (according to the Arena). The results indicate that the degree of agreement between benchmarks varies with the number of top models considered, highlighting that different selections of models can lead to varying conclusions about benchmark agreement.",
285
+ use_column_width=True,
286
+ )
287
+
288
+ st.markdown(
289
+ """
290
+ - **Arbitrary Selection of Reference Benchmarks:** One of the most critical decisions in BAT is the choice of reference benchmarks. Currently, this choice is often arbitrary and lacks a clear rationale, influenced by availability or personal preference rather than strategic alignment with the benchmark’s purpose. This can skew the results significantly, as different benchmarks may not be equally representative or relevant to the models being tested.
291
+ """
292
+ )
293
+ st.markdown(
294
+ """
295
+ - **Inadequate Model Representation:** BAT frequently relies on a limited subset of models, which may not comprehensively represent the diversity of architectures and training paradigms in modern language models. This selective representation can lead to biased agreement scores that favor certain types of models over others, failing to provide a holistic view of model performance across different benchmarks.
296
+ """
297
+ )
298
 
299
+ st.image(
300
+ "assets/pointplot_granularity_matters.png",
301
+ caption="Correlations increase with number of models. Mean correlation (y) between each benchmark (lines) and the rest, given different numbers of models. The Blue and Orange lines are the average of all benchmark pair correlations with models sampled randomly (orange) or in contiguous sets (blue). The shaded lines represents adjacent sampling for the different benchmarks.",
302
+ use_column_width=True,
303
+ )
304
+
305
+ st.markdown(
306
+ """
307
+ - **Overemphasis on Correlation Metrics:** Current BAT practices tend to over-rely on correlation metrics without adequately considering their limitations and the context of their application. While these metrics can provide useful insights, they are often treated as definitive evidence of agreement without acknowledging that high correlation does not necessarily imply conceptual alignment between benchmarks.
308
+ """
309
+ )
310
+
311
+ st.markdown(
312
+ """
313
+ To address these issues, there is a critical need for a more structured approach to BAT that includes clear guidelines for benchmark and model selection, a broader consideration of agreement metrics, and an acknowledgment of the evolving nature of technology in this space. By reforming BAT practices, the research community can improve the reliability and utility of benchmarks as tools for evaluating and advancing language models.
314
+ """
315
+ )
316
+
317
+
318
+ st.image(
319
+ "assets/ablations.png",
320
+ caption="Our recommendations substantially reduce the variance of BAT. Ablation analysis for each BAT recommendation separately and their combinations.",
321
+ use_column_width=True,
322
+ )
323
+
324
+
325
+ st.header("The BenchBench package")
326
+
327
+ st.markdown("""
328
+ ### Overview
329
+
330
+ The BAT package is designed to facilitate benchmark agreement testing for NLP models. It allows users to easily compare multiple models against various benchmarks and generate comprehensive reports on their agreement.
331
+
332
+ ### Installation
333
 
334
+ To install the BAT package, you can use pip:
335
+
336
+ ```
337
+ pip install bat-package
338
+ ```
339
+
340
+ ### Usage Example
341
+
342
+ Below is a step-by-step example of how to use the BAT package to perform agreement testing.
343
+
344
+ #### Step 1: Configuration
345
+
346
+ First, set up the configuration for the tests:
347
+
348
+ ```python
349
  import pandas as pd
350
  from bat import Tester, Config, Benchmark, Reporter
351
  from bat.utils import get_holistic_benchmark
352
 
 
353
  cfg = Config(
354
  exp_to_run="example",
355
  n_models_taken_list=[0],
356
  model_select_strategy_list=["random"],
357
+ n_exps=10
 
358
  )
359
+ ```
360
 
361
+ #### Step 2: Fetch Model Names
362
 
363
+ Fetch the names of the reference models to be used for scoring:
 
364
 
365
+ ```python
366
  tester = Tester(cfg=cfg)
367
+ models_for_benchmark_scoring = tester.fetch_reference_models_names(
368
+ reference_benchmark=get_holistic_benchmark(), n_models=20
369
+ )
370
+ print(models_for_benchmark_scoring)
371
+ ```
372
+
373
+ #### Step 3: Load and Prepare Benchmark
374
 
375
+ Load a new benchmark and add an aggregate column:
 
 
376
 
377
+ ```python
378
+ newbench_name = "fakebench"
379
  newbench = Benchmark(
380
+ pd.read_csv(f"src/bat/assets/{newbench_name}.csv"),
381
  data_source=newbench_name,
382
  )
383
+ newbench.add_aggregate(new_col_name=f"{newbench_name}_mwr")
384
+ ```
385
 
386
+ #### Step 4: Agreement Testing
 
387
 
388
+ Perform all-vs-all agreement testing on the new benchmark:
389
+
390
+ ```python
391
+ newbench_agreements = tester.all_vs_all_agreement_testing(newbench)
392
  reporter = Reporter()
393
+ reporter.draw_agreements(newbench_agreements)
394
+ ```
395
+
396
+ #### Step 5: Extend and Clean Benchmark
397
 
398
+ Extend the new benchmark with holistic data and clear repeated scenarios:
 
399
 
400
+ ```python
401
+ allbench = newbench.extend(get_holistic_benchmark())
402
  allbench.clear_repeated_scenarios(source_to_keep=newbench_name)
403
+ ```
404
+
405
+ #### Step 6: Comprehensive Agreement Testing
406
 
407
+ Perform comprehensive agreement testing and visualize:
408
 
409
+ ```python
410
+ all_agreements = tester.all_vs_all_agreement_testing(allbench)
411
+ reporter.draw_agreements(all_agreements)
412
+ ```
413
+ """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
assets/ablations.png ADDED
assets/combined_holistic.csv DELETED
@@ -1,825 +0,0 @@
1
- ,model,score,scenario,source,aggragated_from
2
- 0,gpt-4-turbo-2024-04-09,82.6,arena-hard,arena_hard_2404,[]
3
- 1,gpt-4-0125-preview,78.0,arena-hard,arena_hard_2404,[]
4
- 2,gemini-1.5-pro-api-preview,72.0,arena-hard,arena_hard_2404,[]
5
- 3,yi-large,63.7,arena-hard,arena_hard_2404,[]
6
- 4,claude-3-opus-20240229,60.4,arena-hard,arena_hard_2404,[]
7
- 5,glm-4,55.7,arena-hard,arena_hard_2404,[]
8
- 6,gpt-4-0314,50.0,arena-hard,arena_hard_2404,[]
9
- 7,gemini-1.5-flash-api-preview,49.6,arena-hard,arena_hard_2404,[]
10
- 8,claude-3-sonnet-20240229,46.8,arena-hard,arena_hard_2404,[]
11
- 9,claude-3-haiku-20240307,41.5,arena-hard,arena_hard_2404,[]
12
- 10,llama-3-70b-chat-hf,41.1,arena-hard,arena_hard_2404,[]
13
- 11,gpt-4-0613,37.9,arena-hard,arena_hard_2404,[]
14
- 12,mistral-large-2402,37.7,arena-hard,arena_hard_2404,[]
15
- 13,mixtral-8x22b-instruct-v0.1,36.4,arena-hard,arena_hard_2404,[]
16
- 14,qwen1.5-72b-chat,36.1,arena-hard,arena_hard_2404,[]
17
- 15,command-r-plus,33.1,arena-hard,arena_hard_2404,[]
18
- 16,mistral-medium,31.9,arena-hard,arena_hard_2404,[]
19
- 17,mistral-next,27.4,arena-hard,arena_hard_2404,[]
20
- 18,gpt-3.5-turbo-0613,24.8,arena-hard,arena_hard_2404,[]
21
- 19,claude-2.0,24.0,arena-hard,arena_hard_2404,[]
22
- 20,dbrx-instructruct,23.9,arena-hard,arena_hard_2404,[]
23
- 21,mixtral-8x7b-instruct-v0.1,23.4,arena-hard,arena_hard_2404,[]
24
- 22,gpt-3.5-turbo-0125,23.3,arena-hard,arena_hard_2404,[]
25
- 23,yi-34b-chat,23.1,arena-hard,arena_hard_2404,[]
26
- 24,starling-lm-7b-beta,23.0,arena-hard,arena_hard_2404,[]
27
- 25,claude-2.1,22.8,arena-hard,arena_hard_2404,[]
28
- 26,snorkel-mistral-pairrm-dpo,20.7,arena-hard,arena_hard_2404,[]
29
- 27,llama-3-8b-chat-hf,20.6,arena-hard,arena_hard_2404,[]
30
- 28,gpt-3.5-turbo-1106,18.9,arena-hard,arena_hard_2404,[]
31
- 29,gpt-3.5-turbo-0301,18.1,arena-hard,arena_hard_2404,[]
32
- 30,gemini-1.0-pro,17.8,arena-hard,arena_hard_2404,[]
33
- 31,snowflake-arctic-instruct,17.6,arena-hard,arena_hard_2404,[]
34
- 32,command-r,17.0,arena-hard,arena_hard_2404,[]
35
- 33,phi-3-mini-128k-instruct,15.4,arena-hard,arena_hard_2404,[]
36
- 34,tulu-2-dpo-70b,15.0,arena-hard,arena_hard_2404,[]
37
- 35,starling-lm-7b-alpha,12.8,arena-hard,arena_hard_2404,[]
38
- 36,mistral-7b-instruct,12.6,arena-hard,arena_hard_2404,[]
39
- 37,gemma-1.1-7b-it,12.1,arena-hard,arena_hard_2404,[]
40
- 38,llama-2-70b-chat-hf,11.6,arena-hard,arena_hard_2404,[]
41
- 39,vicuna-33b-v1.3,8.6,arena-hard,arena_hard_2404,[]
42
- 40,gemma-7b-it,7.5,arena-hard,arena_hard_2404,[]
43
- 41,llama-2-7b-chat-hf,4.6,arena-hard,arena_hard_2404,[]
44
- 42,gemma-1.1-2b-it,3.4,arena-hard,arena_hard_2404,[]
45
- 43,gemma-2b-it,3.0,arena-hard,arena_hard_2404,[]
46
- 0,gpt-4o-2024-05-13,64.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
47
- 1,claude-3-opus,63.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
48
- 2,gpt-4-turbo-2024-04-09,62.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
49
- 3,gemini-1.5-pro-api-0409,58.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
50
- 4,yi-large-preview,56.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
51
- 5,llama-3-70b-instruct,55.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
52
- 6,qwen-max-0428,55.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
53
- 7,claude-3-sonnet,54.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
54
- 8,reka-core-20240415,52.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
55
- 9,mammoth2-8x7b-plus,51.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
56
- 10,deepseek-v2,51.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
57
- 11,command-r-plus,51.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
58
- 12,yi-1.5-34b-chat,51.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
59
- 13,mistral-large,50.3,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
60
- 14,qwen1.5-72b-chat,48.3,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
61
- 15,mistral-medium,47.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
62
- 16,gemini-1.0-pro,46.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
63
- 17,reka-flash-20240226,46.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
64
- 18,mistral-small,46.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
65
- 19,llama-3-8b-instruct,45.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
66
- 20,command-r,45.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
67
- 21,qwen1.5-32b-chat,43.3,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
68
- 22,gpt-3.5-turbo-0125,43.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
69
- 23,claude-3-haiku,42.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
70
- 24,yi-34b-chat,42.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
71
- 25,mixtral-8x7b-instruct-v0.1,42.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
72
- 26,starling-lm-7b-beta,41.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
73
- 27,yi-1.5-9b-chat,40.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
74
- 28,gemma-1.1-7b-it,39.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
75
- 29,vicuna-33b-v1.3,38.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
76
- 30,llama-2-70b-chat,38.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
77
- 31,map-neo-instruct-v0.1,37.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
78
- 32,mistral-7b-instruct-v0.2,36.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
79
- 33,qwen1.5-7b-chat,35.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
80
- 34,reka-edge-20240208,32.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
81
- 35,zephyr-7b-beta,31.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
82
- 36,llama-2-7b-chat,30.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
83
- 37,yi-6b-chat,30.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
84
- 38,qwen1.5-moe-a2.7b-chat,29.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
85
- 39,gemma-1.1-2b-it,28.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
86
- 40,vicuna-7b-v1.5,27.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
87
- 41,olmo-7b-instruct,26.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
88
- 42,qwen1.5-4b-chat,24.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
89
- 43,jetmoe-8b-chat,24.3,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
90
- 44,mpt-7b-chat,23.8,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
91
- 45,llama-3-70b,54.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
92
- 46,qwen1.5-72b,41.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
93
- 47,yi-34b,47.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
94
- 48,qwen1.5-32b,41.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
95
- 49,mixtral-8x7b,40.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
96
- 50,llama-2-70b,41.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
97
- 51,qwen1.5-moe-a2.7b,33.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
98
- 52,qwen1.5-7b,33.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
99
- 53,llama-3-8b,31.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
100
- 54,mistral-7b,27.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
101
- 55,gemma-7b,32.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
102
- 56,yi-6b,30.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
103
- 57,qwen1.5-4b,23.5,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
104
- 58,jetmoe-8b,27.0,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
105
- 59,deepseek-7b,21.7,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
106
- 60,phi-2,21.9,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
107
- 61,deepseekmoe-16b,24.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
108
- 62,llama-2-7b,22.1,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
109
- 63,gemma-2b,22.6,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
110
- 64,olmo-7b,21.2,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
111
- 65,mpt-7b,17.4,mixeval-hard,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
112
- 66,gpt-4o-2024-05-13,87.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
113
- 67,claude-3-opus,88.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
114
- 68,gpt-4-turbo-2024-04-09,88.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
115
- 69,gemini-1.5-pro-api-0409,84.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
116
- 70,yi-large-preview,84.4,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
117
- 71,llama-3-70b-instruct,84.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
118
- 72,qwen-max-0428,86.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
119
- 73,claude-3-sonnet,81.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
120
- 74,reka-core-20240415,83.3,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
121
- 75,mammoth2-8x7b-plus,81.5,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
122
- 76,deepseek-v2,83.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
123
- 77,command-r-plus,81.5,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
124
- 78,yi-1.5-34b-chat,81.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
125
- 79,mistral-large,84.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
126
- 80,qwen1.5-72b-chat,84.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
127
- 81,mistral-medium,81.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
128
- 82,gemini-1.0-pro,78.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
129
- 83,reka-flash-20240226,79.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
130
- 84,mistral-small,81.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
131
- 85,llama-3-8b-instruct,75.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
132
- 86,command-r,77.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
133
- 87,qwen1.5-32b-chat,81.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
134
- 88,gpt-3.5-turbo-0125,79.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
135
- 89,claude-3-haiku,79.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
136
- 90,yi-34b-chat,80.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
137
- 91,mixtral-8x7b-instruct-v0.1,76.4,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
138
- 92,starling-lm-7b-beta,74.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
139
- 93,yi-1.5-9b-chat,74.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
140
- 94,gemma-1.1-7b-it,69.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
141
- 95,vicuna-33b-v1.3,66.3,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
142
- 96,llama-2-70b-chat,74.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
143
- 97,map-neo-instruct-v0.1,70.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
144
- 98,mistral-7b-instruct-v0.2,70.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
145
- 99,qwen1.5-7b-chat,71.4,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
146
- 100,reka-edge-20240208,68.5,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
147
- 101,zephyr-7b-beta,69.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
148
- 102,llama-2-7b-chat,61.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
149
- 103,yi-6b-chat,65.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
150
- 104,qwen1.5-moe-a2.7b-chat,69.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
151
- 105,gemma-1.1-2b-it,51.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
152
- 106,vicuna-7b-v1.5,60.3,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
153
- 107,olmo-7b-instruct,55.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
154
- 108,qwen1.5-4b-chat,57.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
155
- 109,jetmoe-8b-chat,51.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
156
- 110,mpt-7b-chat,43.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
157
- 111,llama-3-70b,82.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
158
- 112,qwen1.5-72b,79.5,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
159
- 113,yi-34b,78.3,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
160
- 114,qwen1.5-32b,77.6,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
161
- 115,mixtral-8x7b,74.0,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
162
- 116,llama-2-70b,73.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
163
- 117,qwen1.5-moe-a2.7b,70.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
164
- 118,qwen1.5-7b,68.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
165
- 119,llama-3-8b,65.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
166
- 120,mistral-7b,64.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
167
- 121,gemma-7b,64.7,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
168
- 122,yi-6b,63.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
169
- 123,qwen1.5-4b,58.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
170
- 124,jetmoe-8b,57.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
171
- 125,deepseek-7b,52.2,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
172
- 126,phi-2,51.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
173
- 127,deepseekmoe-16b,51.4,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
174
- 128,llama-2-7b,43.1,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
175
- 129,gemma-2b,38.9,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
176
- 130,olmo-7b,31.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
177
- 131,mpt-7b,30.8,mixeval,mixeval_240601,"['triviaqa-mixed', 'mmlu-mixed', 'drop-mixed', 'hellaswag-mixed', 'commonsenseqa-mixed', 'triviaqa-hard-mixed', 'mmlu-hard-mixed', 'drop-hard-mixed', 'boolq-mixed']"
178
- 264,gpt-4o-2024-05-13,85.4,mmlu-mixed,mixeval_240601,[]
179
- 265,claude-3-opus,83.2,mmlu-mixed,mixeval_240601,[]
180
- 266,gpt-4-turbo-2024-04-09,82.8,mmlu-mixed,mixeval_240601,[]
181
- 267,gemini-1.5-pro-api-0409,79.2,mmlu-mixed,mixeval_240601,[]
182
- 268,yi-large-preview,80.9,mmlu-mixed,mixeval_240601,[]
183
- 269,llama-3-70b-instruct,80.5,mmlu-mixed,mixeval_240601,[]
184
- 270,qwen-max-0428,80.6,mmlu-mixed,mixeval_240601,[]
185
- 271,claude-3-sonnet,74.7,mmlu-mixed,mixeval_240601,[]
186
- 272,reka-core-20240415,79.3,mmlu-mixed,mixeval_240601,[]
187
- 273,mammoth2-8x7b-plus,74.5,mmlu-mixed,mixeval_240601,[]
188
- 274,deepseek-v2,77.3,mmlu-mixed,mixeval_240601,[]
189
- 275,command-r-plus,78.9,mmlu-mixed,mixeval_240601,[]
190
- 276,yi-1.5-34b-chat,76.4,mmlu-mixed,mixeval_240601,[]
191
- 277,mistral-large,80.2,mmlu-mixed,mixeval_240601,[]
192
- 278,qwen1.5-72b-chat,80.1,mmlu-mixed,mixeval_240601,[]
193
- 279,mistral-medium,76.3,mmlu-mixed,mixeval_240601,[]
194
- 280,gemini-1.0-pro,74.9,mmlu-mixed,mixeval_240601,[]
195
- 281,reka-flash-20240226,75.4,mmlu-mixed,mixeval_240601,[]
196
- 282,mistral-small,75.2,mmlu-mixed,mixeval_240601,[]
197
- 283,llama-3-8b-instruct,71.9,mmlu-mixed,mixeval_240601,[]
198
- 284,command-r,75.0,mmlu-mixed,mixeval_240601,[]
199
- 285,qwen1.5-32b-chat,78.0,mmlu-mixed,mixeval_240601,[]
200
- 286,gpt-3.5-turbo-0125,74.5,mmlu-mixed,mixeval_240601,[]
201
- 287,claude-3-haiku,76.1,mmlu-mixed,mixeval_240601,[]
202
- 288,yi-34b-chat,73.6,mmlu-mixed,mixeval_240601,[]
203
- 289,mixtral-8x7b-instruct-v0.1,72.0,mmlu-mixed,mixeval_240601,[]
204
- 290,starling-lm-7b-beta,69.0,mmlu-mixed,mixeval_240601,[]
205
- 291,yi-1.5-9b-chat,72.6,mmlu-mixed,mixeval_240601,[]
206
- 292,gemma-1.1-7b-it,66.9,mmlu-mixed,mixeval_240601,[]
207
- 293,vicuna-33b-v1.3,59.2,mmlu-mixed,mixeval_240601,[]
208
- 294,llama-2-70b-chat,69.8,mmlu-mixed,mixeval_240601,[]
209
- 295,map-neo-instruct-v0.1,66.7,mmlu-mixed,mixeval_240601,[]
210
- 296,mistral-7b-instruct-v0.2,67.3,mmlu-mixed,mixeval_240601,[]
211
- 297,qwen1.5-7b-chat,68.7,mmlu-mixed,mixeval_240601,[]
212
- 298,reka-edge-20240208,63.6,mmlu-mixed,mixeval_240601,[]
213
- 299,zephyr-7b-beta,64.9,mmlu-mixed,mixeval_240601,[]
214
- 300,llama-2-7b-chat,59.4,mmlu-mixed,mixeval_240601,[]
215
- 301,yi-6b-chat,65.4,mmlu-mixed,mixeval_240601,[]
216
- 302,qwen1.5-moe-a2.7b-chat,69.5,mmlu-mixed,mixeval_240601,[]
217
- 303,gemma-1.1-2b-it,51.5,mmlu-mixed,mixeval_240601,[]
218
- 304,vicuna-7b-v1.5,58.7,mmlu-mixed,mixeval_240601,[]
219
- 305,olmo-7b-instruct,57.1,mmlu-mixed,mixeval_240601,[]
220
- 306,qwen1.5-4b-chat,61.4,mmlu-mixed,mixeval_240601,[]
221
- 307,jetmoe-8b-chat,58.5,mmlu-mixed,mixeval_240601,[]
222
- 308,mpt-7b-chat,37.8,mmlu-mixed,mixeval_240601,[]
223
- 309,llama-3-70b,79.8,mmlu-mixed,mixeval_240601,[]
224
- 310,qwen1.5-72b,78.8,mmlu-mixed,mixeval_240601,[]
225
- 311,yi-34b,79.3,mmlu-mixed,mixeval_240601,[]
226
- 312,qwen1.5-32b,77.2,mmlu-mixed,mixeval_240601,[]
227
- 313,mixtral-8x7b,71.6,mmlu-mixed,mixeval_240601,[]
228
- 314,llama-2-70b,70.8,mmlu-mixed,mixeval_240601,[]
229
- 315,qwen1.5-moe-a2.7b,69.4,mmlu-mixed,mixeval_240601,[]
230
- 316,qwen1.5-7b,67.0,mmlu-mixed,mixeval_240601,[]
231
- 317,llama-3-8b,69.5,mmlu-mixed,mixeval_240601,[]
232
- 318,mistral-7b,68.5,mmlu-mixed,mixeval_240601,[]
233
- 319,gemma-7b,67.4,mmlu-mixed,mixeval_240601,[]
234
- 320,yi-6b,71.2,mmlu-mixed,mixeval_240601,[]
235
- 321,qwen1.5-4b,59.6,mmlu-mixed,mixeval_240601,[]
236
- 322,jetmoe-8b,55.3,mmlu-mixed,mixeval_240601,[]
237
- 323,deepseek-7b,53.3,mmlu-mixed,mixeval_240601,[]
238
- 324,phi-2,62.5,mmlu-mixed,mixeval_240601,[]
239
- 325,deepseekmoe-16b,49.9,mmlu-mixed,mixeval_240601,[]
240
- 326,llama-2-7b,40.8,mmlu-mixed,mixeval_240601,[]
241
- 327,gemma-2b,37.4,mmlu-mixed,mixeval_240601,[]
242
- 328,olmo-7b,29.7,mmlu-mixed,mixeval_240601,[]
243
- 329,mpt-7b,30.9,mmlu-mixed,mixeval_240601,[]
244
- 594,gpt-4o-2024-05-13,57.1,mmlu-hard-mixed,mixeval_240601,[]
245
- 595,claude-3-opus,55.0,mmlu-hard-mixed,mixeval_240601,[]
246
- 596,gpt-4-turbo-2024-04-09,45.5,mmlu-hard-mixed,mixeval_240601,[]
247
- 597,gemini-1.5-pro-api-0409,44.6,mmlu-hard-mixed,mixeval_240601,[]
248
- 598,yi-large-preview,48.5,mmlu-hard-mixed,mixeval_240601,[]
249
- 599,llama-3-70b-instruct,46.3,mmlu-hard-mixed,mixeval_240601,[]
250
- 600,qwen-max-0428,41.6,mmlu-hard-mixed,mixeval_240601,[]
251
- 601,claude-3-sonnet,40.7,mmlu-hard-mixed,mixeval_240601,[]
252
- 602,reka-core-20240415,46.3,mmlu-hard-mixed,mixeval_240601,[]
253
- 603,mammoth2-8x7b-plus,41.1,mmlu-hard-mixed,mixeval_240601,[]
254
- 604,deepseek-v2,42.0,mmlu-hard-mixed,mixeval_240601,[]
255
- 605,command-r-plus,42.0,mmlu-hard-mixed,mixeval_240601,[]
256
- 606,yi-1.5-34b-chat,38.1,mmlu-hard-mixed,mixeval_240601,[]
257
- 607,mistral-large,42.4,mmlu-hard-mixed,mixeval_240601,[]
258
- 608,qwen1.5-72b-chat,37.7,mmlu-hard-mixed,mixeval_240601,[]
259
- 609,mistral-medium,38.5,mmlu-hard-mixed,mixeval_240601,[]
260
- 610,gemini-1.0-pro,35.5,mmlu-hard-mixed,mixeval_240601,[]
261
- 611,reka-flash-20240226,34.6,mmlu-hard-mixed,mixeval_240601,[]
262
- 612,mistral-small,33.8,mmlu-hard-mixed,mixeval_240601,[]
263
- 613,llama-3-8b-instruct,40.7,mmlu-hard-mixed,mixeval_240601,[]
264
- 614,command-r,39.0,mmlu-hard-mixed,mixeval_240601,[]
265
- 615,qwen1.5-32b-chat,29.9,mmlu-hard-mixed,mixeval_240601,[]
266
- 616,gpt-3.5-turbo-0125,35.1,mmlu-hard-mixed,mixeval_240601,[]
267
- 617,claude-3-haiku,30.7,mmlu-hard-mixed,mixeval_240601,[]
268
- 618,yi-34b-chat,29.9,mmlu-hard-mixed,mixeval_240601,[]
269
- 619,mixtral-8x7b-instruct-v0.1,37.2,mmlu-hard-mixed,mixeval_240601,[]
270
- 620,starling-lm-7b-beta,34.2,mmlu-hard-mixed,mixeval_240601,[]
271
- 621,yi-1.5-9b-chat,36.8,mmlu-hard-mixed,mixeval_240601,[]
272
- 622,gemma-1.1-7b-it,39.0,mmlu-hard-mixed,mixeval_240601,[]
273
- 623,vicuna-33b-v1.3,39.4,mmlu-hard-mixed,mixeval_240601,[]
274
- 624,llama-2-70b-chat,27.7,mmlu-hard-mixed,mixeval_240601,[]
275
- 625,map-neo-instruct-v0.1,32.5,mmlu-hard-mixed,mixeval_240601,[]
276
- 626,mistral-7b-instruct-v0.2,29.4,mmlu-hard-mixed,mixeval_240601,[]
277
- 627,qwen1.5-7b-chat,29.0,mmlu-hard-mixed,mixeval_240601,[]
278
- 628,reka-edge-20240208,26.4,mmlu-hard-mixed,mixeval_240601,[]
279
- 629,zephyr-7b-beta,24.2,mmlu-hard-mixed,mixeval_240601,[]
280
- 630,llama-2-7b-chat,30.3,mmlu-hard-mixed,mixeval_240601,[]
281
- 631,yi-6b-chat,26.8,mmlu-hard-mixed,mixeval_240601,[]
282
- 632,qwen1.5-moe-a2.7b-chat,26.8,mmlu-hard-mixed,mixeval_240601,[]
283
- 633,gemma-1.1-2b-it,30.3,mmlu-hard-mixed,mixeval_240601,[]
284
- 634,vicuna-7b-v1.5,23.4,mmlu-hard-mixed,mixeval_240601,[]
285
- 635,olmo-7b-instruct,27.3,mmlu-hard-mixed,mixeval_240601,[]
286
- 636,qwen1.5-4b-chat,17.3,mmlu-hard-mixed,mixeval_240601,[]
287
- 637,jetmoe-8b-chat,25.5,mmlu-hard-mixed,mixeval_240601,[]
288
- 638,mpt-7b-chat,24.7,mmlu-hard-mixed,mixeval_240601,[]
289
- 639,llama-3-70b,39.8,mmlu-hard-mixed,mixeval_240601,[]
290
- 640,qwen1.5-72b,42.4,mmlu-hard-mixed,mixeval_240601,[]
291
- 641,yi-34b,42.4,mmlu-hard-mixed,mixeval_240601,[]
292
- 642,qwen1.5-32b,37.2,mmlu-hard-mixed,mixeval_240601,[]
293
- 643,mixtral-8x7b,34.6,mmlu-hard-mixed,mixeval_240601,[]
294
- 644,llama-2-70b,29.0,mmlu-hard-mixed,mixeval_240601,[]
295
- 645,qwen1.5-moe-a2.7b,30.7,mmlu-hard-mixed,mixeval_240601,[]
296
- 646,qwen1.5-7b,28.6,mmlu-hard-mixed,mixeval_240601,[]
297
- 647,llama-3-8b,38.5,mmlu-hard-mixed,mixeval_240601,[]
298
- 648,mistral-7b,27.7,mmlu-hard-mixed,mixeval_240601,[]
299
- 649,gemma-7b,28.1,mmlu-hard-mixed,mixeval_240601,[]
300
- 650,yi-6b,37.2,mmlu-hard-mixed,mixeval_240601,[]
301
- 651,qwen1.5-4b,22.9,mmlu-hard-mixed,mixeval_240601,[]
302
- 652,jetmoe-8b,27.3,mmlu-hard-mixed,mixeval_240601,[]
303
- 653,deepseek-7b,26.4,mmlu-hard-mixed,mixeval_240601,[]
304
- 654,phi-2,29.0,mmlu-hard-mixed,mixeval_240601,[]
305
- 655,deepseekmoe-16b,30.7,mmlu-hard-mixed,mixeval_240601,[]
306
- 656,llama-2-7b,24.7,mmlu-hard-mixed,mixeval_240601,[]
307
- 657,gemma-2b,27.3,mmlu-hard-mixed,mixeval_240601,[]
308
- 658,olmo-7b,25.1,mmlu-hard-mixed,mixeval_240601,[]
309
- 659,mpt-7b,24.2,mmlu-hard-mixed,mixeval_240601,[]
310
- 593,gpt-4-0314,0.57,agieval,BLZ_240312,[]
311
- 594,gpt-4-0613,0.57,agieval,BLZ_240312,[]
312
- 596,claude-1,0.49700000000000005,agieval,BLZ_240312,[]
313
- 601,mixtral-8x7b-instruct-v0.1,0.45299999999999996,agieval,BLZ_240312,[]
314
- 602,yi-34b-chat,0.508,agieval,BLZ_240312,[]
315
- 605,gpt-3.5-turbo-0314,0.43200000000000005,agieval,BLZ_240312,[]
316
- 608,vicuna-33b,0.373,agieval,BLZ_240312,[]
317
- 609,starling-lm-7b-alpha,0.401,agieval,BLZ_240312,[]
318
- 611,llama-2-70b-chat,0.45,agieval,BLZ_240312,[]
319
- 613,openhermes-2.5-mistral-7b,0.43,agieval,BLZ_240312,[]
320
- 614,openchat-3.5,0.42700000000000005,agieval,BLZ_240312,[]
321
- 617,solar-10.7b-instruct-v1.0,0.47600000000000003,agieval,BLZ_240312,[]
322
- 618,dolphin-2.2.1-mistral-7b,0.392,agieval,BLZ_240312,[]
323
- 620,zephyr-7b-beta,0.406,agieval,BLZ_240312,[]
324
- 623,llama-2-13b-chat,0.336,agieval,BLZ_240312,[]
325
- 624,vicuna-13b,0.368,agieval,BLZ_240312,[]
326
- 626,zephyr-7b-alpha,0.38,agieval,BLZ_240312,[]
327
- 627,qwen-14b-chat,0.396,agieval,BLZ_240312,[]
328
- 630,llama-2-7b-chat,0.29600000000000004,agieval,BLZ_240312,[]
329
- 632,mistral-7b-instruct-v0.1,0.335,agieval,BLZ_240312,[]
330
- 634,vicuna-7b,0.314,agieval,BLZ_240312,[]
331
- 636,chatglm3-6b,0.414,agieval,BLZ_240312,[]
332
- 643,chatglm-6b,0.325,agieval,BLZ_240312,[]
333
- 647,llama-13b,0.205,agieval,BLZ_240312,[]
334
- 886,gpt-4-1106-preview,0.977,alpacav1,BLZ_240312,[]
335
- 888,gpt-4-0314,0.9528,alpacav1,BLZ_240312,[]
336
- 889,gpt-4-0613,0.9528,alpacav1,BLZ_240312,[]
337
- 890,mistral-medium,0.9682999999999999,alpacav1,BLZ_240312,[]
338
- 891,claude-1,0.8839,alpacav1,BLZ_240312,[]
339
- 892,claude-2.0,0.9136,alpacav1,BLZ_240312,[]
340
- 893,gemini-pro-dev-api,0.7966,alpacav1,BLZ_240312,[]
341
- 894,claude-2.1,0.8708,alpacav1,BLZ_240312,[]
342
- 895,gpt-3.5-turbo-0613,0.8937,alpacav1,BLZ_240312,[]
343
- 896,mixtral-8x7b-instruct-v0.1,0.9478,alpacav1,BLZ_240312,[]
344
- 897,yi-34b-chat,0.9408,alpacav1,BLZ_240312,[]
345
- 898,gemini-pro,0.7966,alpacav1,BLZ_240312,[]
346
- 900,gpt-3.5-turbo-0314,0.8937,alpacav1,BLZ_240312,[]
347
- 902,tulu-2-dpo-70b,0.9503,alpacav1,BLZ_240312,[]
348
- 903,vicuna-33b,0.8898999999999999,alpacav1,BLZ_240312,[]
349
- 904,starling-lm-7b-alpha,0.9198999999999999,alpacav1,BLZ_240312,[]
350
- 906,llama-2-70b-chat,0.9266,alpacav1,BLZ_240312,[]
351
- 909,openchat-3.5,0.8851,alpacav1,BLZ_240312,[]
352
- 911,gpt-3.5-turbo-1106,0.8626,alpacav1,BLZ_240312,[]
353
- 914,wizardlm-13b-v1.2,0.8917,alpacav1,BLZ_240312,[]
354
- 915,zephyr-7b-beta,0.9059999999999999,alpacav1,BLZ_240312,[]
355
- 918,llama-2-13b-chat,0.8109000000000001,alpacav1,BLZ_240312,[]
356
- 921,zephyr-7b-alpha,0.8576,alpacav1,BLZ_240312,[]
357
- 924,guanaco-33b,0.6596,alpacav1,BLZ_240312,[]
358
- 925,llama-2-7b-chat,0.7137,alpacav1,BLZ_240312,[]
359
- 934,chatglm2-6b,0.47130000000000005,alpacav1,BLZ_240312,[]
360
- 937,openassistant-pythia-12b,0.2596,alpacav1,BLZ_240312,[]
361
- 827,gpt-4-1106-preview,0.5,alpacav2,BLZ_240312,[]
362
- 829,gpt-4-0314,0.221,alpacav2,BLZ_240312,[]
363
- 830,gpt-4-0613,0.158,alpacav2,BLZ_240312,[]
364
- 831,mistral-medium,0.21899999999999997,alpacav2,BLZ_240312,[]
365
- 832,claude-1,0.17,alpacav2,BLZ_240312,[]
366
- 833,claude-2.0,0.172,alpacav2,BLZ_240312,[]
367
- 834,gemini-pro-dev-api,0.16899999999999998,alpacav2,BLZ_240312,[]
368
- 835,claude-2.1,0.157,alpacav2,BLZ_240312,[]
369
- 836,gpt-3.5-turbo-0613,0.141,alpacav2,BLZ_240312,[]
370
- 837,mixtral-8x7b-instruct-v0.1,0.183,alpacav2,BLZ_240312,[]
371
- 838,yi-34b-chat,0.297,alpacav2,BLZ_240312,[]
372
- 839,gemini-pro,0.16899999999999998,alpacav2,BLZ_240312,[]
373
- 840,claude-instant-1,0.161,alpacav2,BLZ_240312,[]
374
- 841,gpt-3.5-turbo-0314,0.096,alpacav2,BLZ_240312,[]
375
- 842,wizardlm-70b-v1.0,0.14400000000000002,alpacav2,BLZ_240312,[]
376
- 843,tulu-2-dpo-70b,0.16,alpacav2,BLZ_240312,[]
377
- 844,vicuna-33b,0.127,alpacav2,BLZ_240312,[]
378
- 845,starling-lm-7b-alpha,0.142,alpacav2,BLZ_240312,[]
379
- 846,deepseek-llm-67b-chat,0.121,alpacav2,BLZ_240312,[]
380
- 847,llama-2-70b-chat,0.139,alpacav2,BLZ_240312,[]
381
- 849,openhermes-2.5-mistral-7b,0.10300000000000001,alpacav2,BLZ_240312,[]
382
- 852,gpt-3.5-turbo-1106,0.092,alpacav2,BLZ_240312,[]
383
- 854,dolphin-2.2.1-mistral-7b,0.09,alpacav2,BLZ_240312,[]
384
- 855,wizardlm-13b-v1.2,0.12,alpacav2,BLZ_240312,[]
385
- 856,zephyr-7b-beta,0.11,alpacav2,BLZ_240312,[]
386
- 859,llama-2-13b-chat,0.077,alpacav2,BLZ_240312,[]
387
- 860,vicuna-13b,0.067,alpacav2,BLZ_240312,[]
388
- 862,zephyr-7b-alpha,0.084,alpacav2,BLZ_240312,[]
389
- 863,qwen-14b-chat,0.075,alpacav2,BLZ_240312,[]
390
- 865,guanaco-33b,0.05,alpacav2,BLZ_240312,[]
391
- 866,llama-2-7b-chat,0.0496,alpacav2,BLZ_240312,[]
392
- 870,vicuna-7b,0.048,alpacav2,BLZ_240312,[]
393
- 875,chatglm2-6b,0.027999999999999997,alpacav2,BLZ_240312,[]
394
- 878,openassistant-pythia-12b,0.018000000000000002,alpacav2,BLZ_240312,[]
395
- 1299,gpt-4-1106-preview,0.32799999999999996,alpacaeval2-lc,BLZ_240312,[]
396
- 1301,gpt-4-0314,0.21600000000000003,alpacaeval2-lc,BLZ_240312,[]
397
- 1302,gpt-4-0613,0.18600000000000003,alpacaeval2-lc,BLZ_240312,[]
398
- 1303,mistral-medium,0.196,alpacaeval2-lc,BLZ_240312,[]
399
- 1304,claude-1,0.21100000000000002,alpacaeval2-lc,BLZ_240312,[]
400
- 1305,claude-2.0,0.21600000000000003,alpacaeval2-lc,BLZ_240312,[]
401
- 1306,gemini-pro-dev-api,0.172,alpacaeval2-lc,BLZ_240312,[]
402
- 1307,claude-2.1,0.193,alpacaeval2-lc,BLZ_240312,[]
403
- 1308,gpt-3.5-turbo-0613,0.14300000000000002,alpacaeval2-lc,BLZ_240312,[]
404
- 1309,mixtral-8x7b-instruct-v0.1,0.168,alpacaeval2-lc,BLZ_240312,[]
405
- 1310,yi-34b-chat,0.188,alpacaeval2-lc,BLZ_240312,[]
406
- 1312,claude-instant-1,0.195,alpacaeval2-lc,BLZ_240312,[]
407
- 1313,gpt-3.5-turbo-0314,0.156,alpacaeval2-lc,BLZ_240312,[]
408
- 1314,wizardlm-70b-v1.0,0.125,alpacaeval2-lc,BLZ_240312,[]
409
- 1315,tulu-2-dpo-70b,0.151,alpacaeval2-lc,BLZ_240312,[]
410
- 1316,vicuna-33b,0.115,alpacaeval2-lc,BLZ_240312,[]
411
- 1317,starling-lm-7b-alpha,0.10099999999999999,alpacaeval2-lc,BLZ_240312,[]
412
- 1318,deepseek-llm-67b-chat,0.141,alpacaeval2-lc,BLZ_240312,[]
413
- 1319,llama-2-70b-chat,0.10400000000000001,alpacaeval2-lc,BLZ_240312,[]
414
- 1321,openhermes-2.5-mistral-7b,0.126,alpacaeval2-lc,BLZ_240312,[]
415
- 1324,gpt-3.5-turbo-1106,0.155,alpacaeval2-lc,BLZ_240312,[]
416
- 1326,dolphin-2.2.1-mistral-7b,0.10800000000000001,alpacaeval2-lc,BLZ_240312,[]
417
- 1327,wizardlm-13b-v1.2,0.099,alpacaeval2-lc,BLZ_240312,[]
418
- 1328,zephyr-7b-beta,0.102,alpacaeval2-lc,BLZ_240312,[]
419
- 1331,llama-2-13b-chat,0.068,alpacaeval2-lc,BLZ_240312,[]
420
- 1332,vicuna-13b,0.085,alpacaeval2-lc,BLZ_240312,[]
421
- 1334,zephyr-7b-alpha,0.086,alpacaeval2-lc,BLZ_240312,[]
422
- 1335,qwen-14b-chat,0.1,alpacaeval2-lc,BLZ_240312,[]
423
- 1338,llama-2-7b-chat,0.045,alpacaeval2-lc,BLZ_240312,[]
424
- 1342,vicuna-7b,0.06,alpacaeval2-lc,BLZ_240312,[]
425
- 0,gpt-4-0125-preview,1.0,arena-elo,BLZ_240312,[]
426
- 1,gpt-4-1106-preview,0.9992019154030327,arena-elo,BLZ_240312,[]
427
- 2,bard-gemini-pro,0.9768555466879489,arena-elo,BLZ_240312,[]
428
- 3,gpt-4-0314,0.9497206703910615,arena-elo,BLZ_240312,[]
429
- 4,gpt-4-0613,0.9273743016759777,arena-elo,BLZ_240312,[]
430
- 5,mistral-medium,0.9177972865123704,arena-elo,BLZ_240312,[]
431
- 6,claude-1,0.9169992019154031,arena-elo,BLZ_240312,[]
432
- 7,claude-2.0,0.9034317637669593,arena-elo,BLZ_240312,[]
433
- 8,gemini-pro-dev-api,0.8938547486033519,arena-elo,BLZ_240312,[]
434
- 9,claude-2.1,0.8930566640063847,arena-elo,BLZ_240312,[]
435
- 10,gpt-3.5-turbo-0613,0.8922585794094174,arena-elo,BLZ_240312,[]
436
- 11,mixtral-8x7b-instruct-v0.1,0.8922585794094174,arena-elo,BLZ_240312,[]
437
- 12,yi-34b-chat,0.8898643256185156,arena-elo,BLZ_240312,[]
438
- 13,gemini-pro,0.8890662410215483,arena-elo,BLZ_240312,[]
439
- 14,claude-instant-1,0.8850758180367119,arena-elo,BLZ_240312,[]
440
- 15,gpt-3.5-turbo-0314,0.8818834796488427,arena-elo,BLZ_240312,[]
441
- 16,wizardlm-70b-v1.0,0.8818834796488427,arena-elo,BLZ_240312,[]
442
- 17,tulu-2-dpo-70b,0.8810853950518756,arena-elo,BLZ_240312,[]
443
- 18,vicuna-33b,0.8723064644852354,arena-elo,BLZ_240312,[]
444
- 19,starling-lm-7b-alpha,0.8699122106943336,arena-elo,BLZ_240312,[]
445
- 20,deepseek-llm-67b-chat,0.8635275339185954,arena-elo,BLZ_240312,[]
446
- 21,llama-2-70b-chat,0.8635275339185954,arena-elo,BLZ_240312,[]
447
- 22,nv-llama2-70b-steerlm-chat,0.8603351955307262,arena-elo,BLZ_240312,[]
448
- 23,openhermes-2.5-mistral-7b,0.8603351955307262,arena-elo,BLZ_240312,[]
449
- 24,openchat-3.5,0.8587390263367917,arena-elo,BLZ_240312,[]
450
- 25,pplx-70b-online,0.8587390263367917,arena-elo,BLZ_240312,[]
451
- 26,gpt-3.5-turbo-1106,0.8547486033519553,arena-elo,BLZ_240312,[]
452
- 27,solar-10.7b-instruct-v1.0,0.8499600957701516,arena-elo,BLZ_240312,[]
453
- 28,dolphin-2.2.1-mistral-7b,0.8499600957701516,arena-elo,BLZ_240312,[]
454
- 29,wizardlm-13b-v1.2,0.8443735035913806,arena-elo,BLZ_240312,[]
455
- 30,zephyr-7b-beta,0.8387869114126097,arena-elo,BLZ_240312,[]
456
- 31,mpt-30b-chat,0.8332003192338387,arena-elo,BLZ_240312,[]
457
- 32,codellama-34b-instruct,0.8324022346368715,arena-elo,BLZ_240312,[]
458
- 33,llama-2-13b-chat,0.8316041500399042,arena-elo,BLZ_240312,[]
459
- 34,vicuna-13b,0.8300079808459697,arena-elo,BLZ_240312,[]
460
- 35,pplx-7b-online,0.8284118116520351,arena-elo,BLZ_240312,[]
461
- 36,zephyr-7b-alpha,0.8276137270550679,arena-elo,BLZ_240312,[]
462
- 37,qwen-14b-chat,0.825219473264166,arena-elo,BLZ_240312,[]
463
- 38,falcon-180b-chat,0.8236233040702314,arena-elo,BLZ_240312,[]
464
- 39,guanaco-33b,0.8236233040702314,arena-elo,BLZ_240312,[]
465
- 40,llama-2-7b-chat,0.8172386272944933,arena-elo,BLZ_240312,[]
466
- 41,stripedhyena-nous-7b,0.8140462889066241,arena-elo,BLZ_240312,[]
467
- 42,mistral-7b-instruct-v0.1,0.8028731045490822,arena-elo,BLZ_240312,[]
468
- 43,palm-chat-bison-001,0.8028731045490822,arena-elo,BLZ_240312,[]
469
- 44,vicuna-7b,0.8020750199521149,arena-elo,BLZ_240312,[]
470
- 45,koala-13b,0.770949720670391,arena-elo,BLZ_240312,[]
471
- 46,chatglm3-6b,0.7661612130885874,arena-elo,BLZ_240312,[]
472
- 47,gpt4all-13b-snoozy,0.74780526735834,arena-elo,BLZ_240312,[]
473
- 48,mpt-7b-chat,0.7430167597765364,arena-elo,BLZ_240312,[]
474
- 49,chatglm2-6b,0.7422186751795691,arena-elo,BLZ_240312,[]
475
- 50,rwkv-4-raven-14b,0.7382282521947326,arena-elo,BLZ_240312,[]
476
- 51,alpaca-13b,0.7214684756584198,arena-elo,BLZ_240312,[]
477
- 52,openassistant-pythia-12b,0.7158818834796489,arena-elo,BLZ_240312,[]
478
- 53,chatglm-6b,0.704708699122107,arena-elo,BLZ_240312,[]
479
- 54,fastchat-t5-3b,0.6975259377494014,arena-elo,BLZ_240312,[]
480
- 55,stablelm-tuned-alpha-7b,0.6743814844373504,arena-elo,BLZ_240312,[]
481
- 56,dolly-v2-12b,0.6568236233040702,arena-elo,BLZ_240312,[]
482
- 57,llama-13b,0.6384676775738228,arena-elo,BLZ_240312,[]
483
- 542,mixtral-8x7b-instruct-v0.1,0.7641,gpt4all,BLZ_240312,[]
484
- 543,yi-34b-chat,0.7212999999999999,gpt4all,BLZ_240312,[]
485
- 550,starling-lm-7b-alpha,0.7272,gpt4all,BLZ_240312,[]
486
- 554,openhermes-2.5-mistral-7b,0.7312000000000001,gpt4all,BLZ_240312,[]
487
- 555,openchat-3.5,0.7292000000000001,gpt4all,BLZ_240312,[]
488
- 558,solar-10.7b-instruct-v1.0,0.7511,gpt4all,BLZ_240312,[]
489
- 559,dolphin-2.2.1-mistral-7b,0.7223999999999999,gpt4all,BLZ_240312,[]
490
- 561,zephyr-7b-beta,0.7182999999999999,gpt4all,BLZ_240312,[]
491
- 565,vicuna-13b,0.631,gpt4all,BLZ_240312,[]
492
- 567,zephyr-7b-alpha,0.7223999999999999,gpt4all,BLZ_240312,[]
493
- 573,mistral-7b-instruct-v0.1,0.6795,gpt4all,BLZ_240312,[]
494
- 575,vicuna-7b,0.61,gpt4all,BLZ_240312,[]
495
- 576,koala-13b,0.62,gpt4all,BLZ_240312,[]
496
- 578,gpt4all-13b-snoozy,0.653,gpt4all,BLZ_240312,[]
497
- 579,mpt-7b-chat,0.648,gpt4all,BLZ_240312,[]
498
- 583,openassistant-pythia-12b,0.61,gpt4all,BLZ_240312,[]
499
- 585,fastchat-t5-3b,0.537,gpt4all,BLZ_240312,[]
500
- 586,stablelm-tuned-alpha-7b,0.513,gpt4all,BLZ_240312,[]
501
- 588,llama-13b,0.63,gpt4all,BLZ_240312,[]
502
- 129,mixtral-8x7b-instruct-v0.1,0.7262000000000001,hugging-6,BLZ_240312,[]
503
- 130,yi-34b-chat,0.6531999999999999,hugging-6,BLZ_240312,[]
504
- 134,wizardlm-70b-v1.0,0.6125,hugging-6,BLZ_240312,[]
505
- 135,tulu-2-dpo-70b,0.7376999999999999,hugging-6,BLZ_240312,[]
506
- 136,vicuna-33b,0.585,hugging-6,BLZ_240312,[]
507
- 137,starling-lm-7b-alpha,0.6713,hugging-6,BLZ_240312,[]
508
- 139,llama-2-70b-chat,0.624,hugging-6,BLZ_240312,[]
509
- 141,openhermes-2.5-mistral-7b,0.6152000000000001,hugging-6,BLZ_240312,[]
510
- 142,openchat-3.5,0.6124,hugging-6,BLZ_240312,[]
511
- 145,solar-10.7b-instruct-v1.0,0.742,hugging-6,BLZ_240312,[]
512
- 146,dolphin-2.2.1-mistral-7b,0.6493000000000001,hugging-6,BLZ_240312,[]
513
- 147,wizardlm-13b-v1.2,0.5476,hugging-6,BLZ_240312,[]
514
- 148,zephyr-7b-beta,0.6195,hugging-6,BLZ_240312,[]
515
- 149,mpt-30b-chat,0.5538000000000001,hugging-6,BLZ_240312,[]
516
- 150,codellama-34b-instruct,0.5729,hugging-6,BLZ_240312,[]
517
- 151,llama-2-13b-chat,0.5490999999999999,hugging-6,BLZ_240312,[]
518
- 152,vicuna-13b,0.5539999999999999,hugging-6,BLZ_240312,[]
519
- 154,zephyr-7b-alpha,0.595,hugging-6,BLZ_240312,[]
520
- 156,falcon-180b-chat,0.6785,hugging-6,BLZ_240312,[]
521
- 158,llama-2-7b-chat,0.5074000000000001,hugging-6,BLZ_240312,[]
522
- 160,mistral-7b-instruct-v0.1,0.5496,hugging-6,BLZ_240312,[]
523
- 162,vicuna-7b,0.521,hugging-6,BLZ_240312,[]
524
- 176,yi-34bx2-moe-60b,0.7672,hugging-6,BLZ_240312,[]
525
- 947,gpt-4-0314,0.93,llmonitor,BLZ_240312,[]
526
- 948,gpt-4-0613,0.89,llmonitor,BLZ_240312,[]
527
- 950,claude-1,0.66,llmonitor,BLZ_240312,[]
528
- 951,claude-2.0,0.68,llmonitor,BLZ_240312,[]
529
- 954,gpt-3.5-turbo-0613,0.81,llmonitor,BLZ_240312,[]
530
- 958,claude-instant-1,0.6,llmonitor,BLZ_240312,[]
531
- 959,gpt-3.5-turbo-0314,0.79,llmonitor,BLZ_240312,[]
532
- 965,llama-2-70b-chat,0.6,llmonitor,BLZ_240312,[]
533
- 975,mpt-30b-chat,0.4,llmonitor,BLZ_240312,[]
534
- 976,codellama-34b-instruct,0.34,llmonitor,BLZ_240312,[]
535
- 977,llama-2-13b-chat,0.5,llmonitor,BLZ_240312,[]
536
- 978,vicuna-13b,0.5,llmonitor,BLZ_240312,[]
537
- 982,falcon-180b-chat,0.67,llmonitor,BLZ_240312,[]
538
- 983,guanaco-33b,0.43,llmonitor,BLZ_240312,[]
539
- 984,llama-2-7b-chat,0.5,llmonitor,BLZ_240312,[]
540
- 986,mistral-7b-instruct-v0.1,0.57,llmonitor,BLZ_240312,[]
541
- 987,palm-chat-bison-001,0.57,llmonitor,BLZ_240312,[]
542
- 988,vicuna-7b,0.41,llmonitor,BLZ_240312,[]
543
- 989,koala-13b,0.31,llmonitor,BLZ_240312,[]
544
- 992,mpt-7b-chat,0.43,llmonitor,BLZ_240312,[]
545
- 1000,dolly-v2-12b,0.23,llmonitor,BLZ_240312,[]
546
- 59,gpt-4-0125-preview,0.0929,mt-bench,BLZ_240312,[]
547
- 60,gpt-4-1106-preview,0.0932,mt-bench,BLZ_240312,[]
548
- 62,gpt-4-0314,0.08960000000000001,mt-bench,BLZ_240312,[]
549
- 63,gpt-4-0613,0.09179999999999999,mt-bench,BLZ_240312,[]
550
- 64,mistral-medium,0.0861,mt-bench,BLZ_240312,[]
551
- 65,claude-1,0.079,mt-bench,BLZ_240312,[]
552
- 66,claude-2.0,0.0806,mt-bench,BLZ_240312,[]
553
- 67,gemini-pro-dev-api,0.08039999999999999,mt-bench,BLZ_240312,[]
554
- 68,claude-2.1,0.0818,mt-bench,BLZ_240312,[]
555
- 69,gpt-3.5-turbo-0613,0.0839,mt-bench,BLZ_240312,[]
556
- 70,mixtral-8x7b-instruct-v0.1,0.083,mt-bench,BLZ_240312,[]
557
- 71,yi-34b-chat,0.07769999999999999,mt-bench,BLZ_240312,[]
558
- 72,gemini-pro,0.08039999999999999,mt-bench,BLZ_240312,[]
559
- 73,claude-instant-1,0.0785,mt-bench,BLZ_240312,[]
560
- 74,gpt-3.5-turbo-0314,0.0794,mt-bench,BLZ_240312,[]
561
- 75,wizardlm-70b-v1.0,0.0771,mt-bench,BLZ_240312,[]
562
- 76,tulu-2-dpo-70b,0.0789,mt-bench,BLZ_240312,[]
563
- 77,vicuna-33b,0.0712,mt-bench,BLZ_240312,[]
564
- 78,starling-lm-7b-alpha,0.0809,mt-bench,BLZ_240312,[]
565
- 79,deepseek-llm-67b-chat,0.08529999999999999,mt-bench,BLZ_240312,[]
566
- 80,llama-2-70b-chat,0.06860000000000001,mt-bench,BLZ_240312,[]
567
- 81,nv-llama2-70b-steerlm-chat,0.0754,mt-bench,BLZ_240312,[]
568
- 82,openhermes-2.5-mistral-7b,0.07690000000000001,mt-bench,BLZ_240312,[]
569
- 83,openchat-3.5,0.0781,mt-bench,BLZ_240312,[]
570
- 84,pplx-70b-online,0.0588,mt-bench,BLZ_240312,[]
571
- 85,gpt-3.5-turbo-1106,0.0832,mt-bench,BLZ_240312,[]
572
- 86,solar-10.7b-instruct-v1.0,0.0758,mt-bench,BLZ_240312,[]
573
- 88,wizardlm-13b-v1.2,0.07200000000000001,mt-bench,BLZ_240312,[]
574
- 89,zephyr-7b-beta,0.07339999999999999,mt-bench,BLZ_240312,[]
575
- 90,mpt-30b-chat,0.0639,mt-bench,BLZ_240312,[]
576
- 92,llama-2-13b-chat,0.0665,mt-bench,BLZ_240312,[]
577
- 93,vicuna-13b,0.06570000000000001,mt-bench,BLZ_240312,[]
578
- 95,zephyr-7b-alpha,0.0688,mt-bench,BLZ_240312,[]
579
- 96,qwen-14b-chat,0.0696,mt-bench,BLZ_240312,[]
580
- 98,guanaco-33b,0.0653,mt-bench,BLZ_240312,[]
581
- 99,llama-2-7b-chat,0.06269999999999999,mt-bench,BLZ_240312,[]
582
- 101,mistral-7b-instruct-v0.1,0.0684,mt-bench,BLZ_240312,[]
583
- 102,palm-chat-bison-001,0.064,mt-bench,BLZ_240312,[]
584
- 103,vicuna-7b,0.0617,mt-bench,BLZ_240312,[]
585
- 104,koala-13b,0.0535,mt-bench,BLZ_240312,[]
586
- 106,gpt4all-13b-snoozy,0.0541,mt-bench,BLZ_240312,[]
587
- 107,mpt-7b-chat,0.0542,mt-bench,BLZ_240312,[]
588
- 108,chatglm2-6b,0.0496,mt-bench,BLZ_240312,[]
589
- 109,rwkv-4-raven-14b,0.0398,mt-bench,BLZ_240312,[]
590
- 110,alpaca-13b,0.0453,mt-bench,BLZ_240312,[]
591
- 111,openassistant-pythia-12b,0.0432,mt-bench,BLZ_240312,[]
592
- 112,chatglm-6b,0.045,mt-bench,BLZ_240312,[]
593
- 113,fastchat-t5-3b,0.0304,mt-bench,BLZ_240312,[]
594
- 114,stablelm-tuned-alpha-7b,0.0275,mt-bench,BLZ_240312,[]
595
- 115,dolly-v2-12b,0.032799999999999996,mt-bench,BLZ_240312,[]
596
- 116,llama-13b,0.026099999999999998,mt-bench,BLZ_240312,[]
597
- 0,gpt-4-0613,0.957,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
598
- 1,llama-3-70b,0.902,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
599
- 2,mixtral-8x22b,0.855,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
600
- 3,palmyra-x-v3-72b,0.826,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
601
- 4,gpt-4-turbo-1106-preview,0.821,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
602
- 5,palm-2-unicorn,0.781,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
603
- 6,claude-3-opus-20240229,0.762,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
604
- 7,qwen1.5-72b,0.757,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
605
- 8,palmyra-x-v2-33b,0.736,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
606
- 9,yi-34b,0.723,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
607
- 10,qwen1.5-32b,0.689,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
608
- 11,claude-v1.3,0.689,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
609
- 12,mixtral-8x7b-32k-seqlen,0.679,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
610
- 13,palm-2-bison,0.655,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
611
- 14,claude-2.0,0.651,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
612
- 15,deepseek-llm-67b-chat,0.645,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
613
- 16,llama-2-70b,0.609,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
614
- 17,claude-2.1,0.594,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
615
- 18,gpt-3.5-text-davinci-003,0.577,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
616
- 19,qwen1.5-14b,0.574,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
617
- 20,claude-instant-1.2,0.551,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
618
- 21,llama-3-8b,0.519,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
619
- 22,gpt-3.5-turbo-0613,0.502,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
620
- 23,gemma-7b,0.47,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
621
- 24,claude-3-sonnet-20240229,0.468,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
622
- 25,gpt-3.5-text-davinci-002,0.468,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
623
- 26,llama-65b,0.466,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
624
- 27,mistral-large-2402,0.46,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
625
- 28,cohere-command,0.421,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
626
- 29,dbrx-instructruct,0.419,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
627
- 30,mistral-v0.1-7b,0.415,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
628
- 31,mistral-small-2402,0.415,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
629
- 32,mistral-medium-2312,0.383,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
630
- 33,qwen1.5-7b,0.377,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
631
- 34,claude-3-haiku-20240307,0.377,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
632
- 35,yi-6b,0.351,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
633
- 36,llama-2-13b,0.332,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
634
- 37,jurassic-2-jumbo-178b,0.317,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
635
- 38,falcon-40b,0.306,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
636
- 39,phi-2,0.26,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
637
- 40,jurassic-2-grande-17b,0.253,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
638
- 41,llama-2-7b,0.234,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
639
- 42,luminous-supreme-70b,0.213,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
640
- 43,cohere-command-light,0.166,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
641
- 44,luminous-extended-30b,0.119,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
642
- 45,falcon-7b,0.1,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
643
- 46,olmo-7b,0.083,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
644
- 47,luminous-base-13b,0.072,helm_lite_mwr,helm_lite_240610,"['narrativeqa', 'naturalquestions_open', 'naturalquestions_closed', 'openbookqa', 'mmlu', 'math', 'gsm8k', 'legalbench', 'medqa', 'wmt_2014']"
645
- 0,llama-2-70b,0.944,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
646
- 1,llama-65b,0.908,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
647
- 2,text-davinci-002,0.905,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
648
- 3,mistral-v0.1-7b,0.884,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
649
- 4,cohere-command-beta-52.4b,0.874,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
650
- 5,text-davinci-003,0.872,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
651
- 6,jurassic-2-jumbo-178b,0.824,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
652
- 7,llama-2-13b,0.823,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
653
- 8,tnlg-v2-530b,0.787,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
654
- 9,gpt-3.5-turbo-0613,0.783,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
655
- 10,llama-30b,0.781,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
656
- 11,anthropic-lm-v4-s3-52b,0.78,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
657
- 12,gpt-3.5-turbo-0301,0.76,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
658
- 13,jurassic-2-grande-17b,0.743,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
659
- 14,palmyra-x-43b,0.732,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
660
- 15,falcon-40b,0.729,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
661
- 16,falcon-instruct-40b,0.727,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
662
- 17,mpt-instruct-30b,0.716,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
663
- 18,mpt-30b,0.714,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
664
- 19,j1-grande-v2-beta-17b,0.706,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
665
- 20,vicuna-v1.3-13b,0.706,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
666
- 21,cohere-command-beta-6.1b,0.675,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
667
- 22,cohere-xlarge-v20221108-52.4b,0.664,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
668
- 23,luminous-supreme-70b,0.662,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
669
- 24,vicuna-v1.3-7b,0.625,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
670
- 25,opt-175b,0.609,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
671
- 26,llama-2-7b,0.607,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
672
- 27,llama-13b,0.595,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
673
- 28,instructpalmyra-30b,0.568,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
674
- 29,cohere-xlarge-v20220609-52.4b,0.56,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
675
- 30,jurassic-2-large-7.5b,0.553,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
676
- 31,davinci-175b,0.538,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
677
- 32,llama-7b,0.533,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
678
- 33,redpajama-incite-instruct-7b,0.524,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
679
- 34,j1-jumbo-v1-178b,0.517,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
680
- 35,glm-130b,0.512,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
681
- 36,luminous-extended-30b,0.485,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
682
- 37,opt-66b,0.448,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
683
- 38,bloom-176b,0.446,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
684
- 39,j1-grande-v1-17b,0.433,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
685
- 40,alpaca-7b,0.381,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
686
- 41,falcon-7b,0.378,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
687
- 42,redpajama-incite-base-7b,0.378,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
688
- 43,cohere-large-v20220720-13.1b,0.372,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
689
- 44,redpajama-incite-instruct-v1-3b,0.366,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
690
- 45,text-curie-001,0.36,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
691
- 46,gpt-neox-20b,0.351,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
692
- 47,luminous-base-13b,0.315,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
693
- 48,cohere-medium-v20221108-6.1b,0.312,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
694
- 49,redpajama-incite-base-v1-3b,0.311,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
695
- 50,tnlg-v2-6.7b,0.309,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
696
- 51,j1-large-v1-7.5b,0.285,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
697
- 52,gpt-j-6b,0.273,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
698
- 53,pythia-12b,0.257,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
699
- 54,curie-6.7b,0.247,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
700
- 55,falcon-instruct-7b,0.244,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
701
- 56,cohere-medium-v20220720-6.1b,0.23,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
702
- 57,text-babbage-001,0.229,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
703
- 58,t0pp-11b,0.197,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
704
- 59,pythia-6.9b,0.196,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
705
- 60,ul2-20b,0.167,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
706
- 61,t5-11b,0.131,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
707
- 62,babbage-1.3b,0.114,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
708
- 63,cohere-small-v20220720-410m,0.109,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
709
- 64,ada-350m,0.108,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
710
- 65,text-ada-001,0.107,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
711
- 66,yalm-100b,0.075,helm_mwr,helm_classic_240130,"['mmlu', 'boolq', 'narrativeqa', 'naturalquestions-closed', 'naturalquestions-open', 'quac', 'hellaswag', 'openbookqa', 'truthfulqa', 'ms-marco-regular', 'ms-marco-trec', 'cnn/dailymail', 'xsum', 'imdb', 'civilcomments', 'raft']"
712
- 67,llama-2-70b,0.582,mmlu,helm_classic_240130,[]
713
- 68,llama-65b,0.584,mmlu,helm_classic_240130,[]
714
- 69,text-davinci-002,0.568,mmlu,helm_classic_240130,[]
715
- 70,mistral-v0.1-7b,0.572,mmlu,helm_classic_240130,[]
716
- 71,cohere-command-beta-52.4b,0.452,mmlu,helm_classic_240130,[]
717
- 72,text-davinci-003,0.569,mmlu,helm_classic_240130,[]
718
- 73,jurassic-2-jumbo-178b,0.48,mmlu,helm_classic_240130,[]
719
- 74,llama-2-13b,0.507,mmlu,helm_classic_240130,[]
720
- 75,tnlg-v2-530b,0.469,mmlu,helm_classic_240130,[]
721
- 76,gpt-3.5-turbo-0613,0.391,mmlu,helm_classic_240130,[]
722
- 77,llama-30b,0.531,mmlu,helm_classic_240130,[]
723
- 78,anthropic-lm-v4-s3-52b,0.481,mmlu,helm_classic_240130,[]
724
- 79,gpt-3.5-turbo-0301,0.59,mmlu,helm_classic_240130,[]
725
- 80,jurassic-2-grande-17b,0.475,mmlu,helm_classic_240130,[]
726
- 81,palmyra-x-43b,0.609,mmlu,helm_classic_240130,[]
727
- 82,falcon-40b,0.509,mmlu,helm_classic_240130,[]
728
- 83,falcon-instruct-40b,0.497,mmlu,helm_classic_240130,[]
729
- 84,mpt-instruct-30b,0.444,mmlu,helm_classic_240130,[]
730
- 85,mpt-30b,0.437,mmlu,helm_classic_240130,[]
731
- 86,j1-grande-v2-beta-17b,0.445,mmlu,helm_classic_240130,[]
732
- 87,vicuna-v1.3-13b,0.462,mmlu,helm_classic_240130,[]
733
- 88,cohere-command-beta-6.1b,0.406,mmlu,helm_classic_240130,[]
734
- 89,cohere-xlarge-v20221108-52.4b,0.382,mmlu,helm_classic_240130,[]
735
- 90,luminous-supreme-70b,0.38,mmlu,helm_classic_240130,[]
736
- 91,vicuna-v1.3-7b,0.434,mmlu,helm_classic_240130,[]
737
- 92,opt-175b,0.318,mmlu,helm_classic_240130,[]
738
- 93,llama-2-7b,0.431,mmlu,helm_classic_240130,[]
739
- 94,llama-13b,0.422,mmlu,helm_classic_240130,[]
740
- 95,instructpalmyra-30b,0.403,mmlu,helm_classic_240130,[]
741
- 96,cohere-xlarge-v20220609-52.4b,0.353,mmlu,helm_classic_240130,[]
742
- 97,jurassic-2-large-7.5b,0.339,mmlu,helm_classic_240130,[]
743
- 98,davinci-175b,0.422,mmlu,helm_classic_240130,[]
744
- 99,llama-7b,0.321,mmlu,helm_classic_240130,[]
745
- 100,redpajama-incite-instruct-7b,0.363,mmlu,helm_classic_240130,[]
746
- 101,j1-jumbo-v1-178b,0.259,mmlu,helm_classic_240130,[]
747
- 102,glm-130b,0.344,mmlu,helm_classic_240130,[]
748
- 103,luminous-extended-30b,0.321,mmlu,helm_classic_240130,[]
749
- 104,opt-66b,0.276,mmlu,helm_classic_240130,[]
750
- 105,bloom-176b,0.299,mmlu,helm_classic_240130,[]
751
- 106,j1-grande-v1-17b,0.27,mmlu,helm_classic_240130,[]
752
- 107,alpaca-7b,0.385,mmlu,helm_classic_240130,[]
753
- 108,falcon-7b,0.286,mmlu,helm_classic_240130,[]
754
- 109,redpajama-incite-base-7b,0.302,mmlu,helm_classic_240130,[]
755
- 110,cohere-large-v20220720-13.1b,0.324,mmlu,helm_classic_240130,[]
756
- 111,redpajama-incite-instruct-v1-3b,0.257,mmlu,helm_classic_240130,[]
757
- 112,text-curie-001,0.237,mmlu,helm_classic_240130,[]
758
- 113,gpt-neox-20b,0.276,mmlu,helm_classic_240130,[]
759
- 114,luminous-base-13b,0.27,mmlu,helm_classic_240130,[]
760
- 115,cohere-medium-v20221108-6.1b,0.254,mmlu,helm_classic_240130,[]
761
- 116,redpajama-incite-base-v1-3b,0.263,mmlu,helm_classic_240130,[]
762
- 117,tnlg-v2-6.7b,0.242,mmlu,helm_classic_240130,[]
763
- 118,j1-large-v1-7.5b,0.241,mmlu,helm_classic_240130,[]
764
- 119,gpt-j-6b,0.249,mmlu,helm_classic_240130,[]
765
- 120,pythia-12b,0.274,mmlu,helm_classic_240130,[]
766
- 121,curie-6.7b,0.243,mmlu,helm_classic_240130,[]
767
- 122,falcon-instruct-7b,0.275,mmlu,helm_classic_240130,[]
768
- 123,cohere-medium-v20220720-6.1b,0.279,mmlu,helm_classic_240130,[]
769
- 124,text-babbage-001,0.229,mmlu,helm_classic_240130,[]
770
- 125,t0pp-11b,0.407,mmlu,helm_classic_240130,[]
771
- 126,pythia-6.9b,0.236,mmlu,helm_classic_240130,[]
772
- 127,ul2-20b,0.291,mmlu,helm_classic_240130,[]
773
- 128,t5-11b,0.29,mmlu,helm_classic_240130,[]
774
- 129,babbage-1.3b,0.235,mmlu,helm_classic_240130,[]
775
- 130,cohere-small-v20220720-410m,0.264,mmlu,helm_classic_240130,[]
776
- 131,ada-350m,0.243,mmlu,helm_classic_240130,[]
777
- 132,text-ada-001,0.238,mmlu,helm_classic_240130,[]
778
- 133,yalm-100b,0.243,mmlu,helm_classic_240130,[]
779
- 0,gpt-4o-0513,35.7,wildbench-mix,wildbench_240612,[]
780
- 1,gpt-4-turbo-0409,34.6,wildbench-mix,wildbench_240612,[]
781
- 2,gpt-4-turbo-0125,29.9,wildbench-mix,wildbench_240612,[]
782
- 3,gemini-1.5-pro,27.8,wildbench-mix,wildbench_240612,[]
783
- 4,llama-3-70b-inst,21.0,wildbench-mix,wildbench_240612,[]
784
- 5,claude-3-opus,20.1,wildbench-mix,wildbench_240612,[]
785
- 6,gemini-1.5-flash,17.4,wildbench-mix,wildbench_240612,[]
786
- 7,yi-1.5-34b-chat,16.8,wildbench-mix,wildbench_240612,[]
787
- 8,llama3-inst-8b-simpo,14.0,wildbench-mix,wildbench_240612,[]
788
- 9,claude-3-sonnet,7.2,wildbench-mix,wildbench_240612,[]
789
- 10,qwen1.5-72b-chat,4.4,wildbench-mix,wildbench_240612,[]
790
- 11,command-r-plus,0.4,wildbench-mix,wildbench_240612,[]
791
- 12,claude-3-haiku,-8.5,wildbench-mix,wildbench_240612,[]
792
- 13,mistral-large,-10.5,wildbench-mix,wildbench_240612,[]
793
- 14,starlinglm-7b-beta,-11.9,wildbench-mix,wildbench_240612,[]
794
- 15,llama-3-8b-inst,-14.6,wildbench-mix,wildbench_240612,[]
795
- 16,command-r,-16.0,wildbench-mix,wildbench_240612,[]
796
- 17,mixtral-8x7b-inst,-18.8,wildbench-mix,wildbench_240612,[]
797
- 18,dbrx-instruct,-21.6,wildbench-mix,wildbench_240612,[]
798
- 19,yi-1.5-6b-chat,-24.3,wildbench-mix,wildbench_240612,[]
799
- 20,mistral-7b-inst-v0.2,-25.0,wildbench-mix,wildbench_240612,[]
800
- 21,tulu-2-dpo-70b,-25.4,wildbench-mix,wildbench_240612,[]
801
- 22,llama-2-70b-chat,-26.8,wildbench-mix,wildbench_240612,[]
802
- 23,qwen1.5-7b-chat,-27.0,wildbench-mix,wildbench_240612,[]
803
- 24,phi-3-medium-128k,-33.3,wildbench-mix,wildbench_240612,[]
804
- 25,gpt-3.5-turbo-0125,-33.5,wildbench-mix,wildbench_240612,[]
805
- 26,llama-2-7b-chat,-48.0,wildbench-mix,wildbench_240612,[]
806
- 27,gemma-7b-it,-57.0,wildbench-mix,wildbench_240612,[]
807
- 28,gemma-2b-it,-74.1,wildbench-mix,wildbench_240612,[]
808
- 13,flan-t5-xxl,0.2244897959183673,mmlu_pro,bluebench_v02,[]
809
- 30,granite-13b-chat-v2,0.2857142857142857,mmlu_pro,bluebench_v02,[]
810
- 41,granite-13b-instruct-v2,0.0408163265306122,mmlu_pro,bluebench_v02,[]
811
- 50,granite-7b-lab,0.2423469387755102,mmlu_pro,bluebench_v02,[]
812
- 60,llama-2-13b-chat,0.0943877551020408,mmlu_pro,bluebench_v02,[]
813
- 70,llama-2-70b,0.4081632653061224,mmlu_pro,bluebench_v02,[]
814
- 81,llama-3-70b-instruct,0.4285714285714285,mmlu_pro,bluebench_v02,[]
815
- 92,llama-3-8b,0.375,mmlu_pro,bluebench_v02,[]
816
- 103,llama-3-8b-instruct,0.0994897959183673,mmlu_pro,bluebench_v02,[]
817
- 112,llama-30b,0.3061224489795918,mmlu_pro,bluebench_v02,[]
818
- 121,llama-7b,0.1326530612244897,mmlu_pro,bluebench_v02,[]
819
- 132,mistral-v0.1-7b,0.2857142857142857,mmlu_pro,bluebench_v02,[]
820
- 143,mixtral-8x7b-instruct-v01,0.375,mmlu_pro,bluebench_v02,[]
821
- 153,vicuna-13b-v1.5-16k,0.2857142857142857,mmlu_pro,bluebench_v02,[]
822
- 162,vicuna-33b-v1.3,0.2653061224489796,mmlu_pro,bluebench_v02,[]
823
- 172,vicuna-v1.3-7b,0.1938775510204081,mmlu_pro,bluebench_v02,[]
824
- 182,vicuna-7b-v1.5,0.2857142857142857,mmlu_pro,bluebench_v02,[]
825
- 192,zephyr-7b-beta,0.2959183673469387,mmlu_pro,bluebench_v02,[]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
assets/livebench.csv DELETED
@@ -1,365 +0,0 @@
1
- ,model,scenario,score,aggragated_from,source
2
- 0,claude_3_5_sonnet_20240620,livebench_lb,61.16,[],livebench_240701
3
- 1,gpt_4o_2024_05_13,livebench_lb,54.96,[],livebench_240701
4
- 2,gpt_4_turbo_2024_04_09,livebench_lb,53.0,[],livebench_240701
5
- 3,gpt_4_1106_preview,livebench_lb,52.17,[],livebench_240701
6
- 4,claude_3_opus_20240229,livebench_lb,50.75,[],livebench_240701
7
- 5,gpt_4_0125_preview,livebench_lb,49.39,[],livebench_240701
8
- 6,deepseek_coder_v2,livebench_lb,46.79,[],livebench_240701
9
- 7,gemini_1.5_pro_api_0514,livebench_lb,44.35,[],livebench_240701
10
- 8,gemma_2_27b_it,livebench_lb,41.22,[],livebench_240701
11
- 9,gemini_1.5_flash_api_0514,livebench_lb,40.89,[],livebench_240701
12
- 10,qwen2_72b_instruct,livebench_lb,40.16,[],livebench_240701
13
- 11,acm_rewrite_qwen2_72b_chat,livebench_lb,39.6,[],livebench_240701
14
- 12,mistral_large_2402,livebench_lb,38.92,[],livebench_240701
15
- 13,deepseek_chat_v2,livebench_lb,38.39,[],livebench_240701
16
- 14,claude_3_sonnet_20240229,livebench_lb,38.08,[],livebench_240701
17
- 15,meta_llama_3_70b_instruct,livebench_lb,37.38,[],livebench_240701
18
- 16,claude_3_haiku_20240307,livebench_lb,35.32,[],livebench_240701
19
- 17,mixtral_8x22b_instruct_v0.1,livebench_lb,34.84,[],livebench_240701
20
- 18,gpt_3.5_turbo_0125,livebench_lb,34.43,[],livebench_240701
21
- 19,gpt_3.5_turbo_1106,livebench_lb,34.14,[],livebench_240701
22
- 20,command_r_plus,livebench_lb,32.86,[],livebench_240701
23
- 21,mistral_small_2402,livebench_lb,32.8,[],livebench_240701
24
- 22,gemma_2_9b_it,livebench_lb,31.57,[],livebench_240701
25
- 23,phi_3_medium_4k_instruct,livebench_lb,30.33,[],livebench_240701
26
- 24,phi_3_medium_128k_instruct,livebench_lb,29.64,[],livebench_240701
27
- 25,deepseek_coder_v2_lite_instruct,livebench_lb,29.15,[],livebench_240701
28
- 26,qwen1.5_110b_chat,livebench_lb,28.96,[],livebench_240701
29
- 27,qwen1.5_72b_chat,livebench_lb,28.89,[],livebench_240701
30
- 28,command_r,livebench_lb,27.23,[],livebench_240701
31
- 29,phi_3_small_128k_instruct,livebench_lb,27.19,[],livebench_240701
32
- 30,meta_llama_3_8b_instruct,livebench_lb,26.67,[],livebench_240701
33
- 31,qwen2_7b_instruct,livebench_lb,26.45,[],livebench_240701
34
- 32,phi_3_small_8k_instruct,livebench_lb,26.24,[],livebench_240701
35
- 33,openhermes_2.5_mistral_7b,livebench_lb,23.3,[],livebench_240701
36
- 34,mixtral_8x7b_instruct_v0.1,livebench_lb,22.5,[],livebench_240701
37
- 35,mistral_7b_instruct_v0.2,livebench_lb,19.33,[],livebench_240701
38
- 36,phi_3_mini_4k_instruct,livebench_lb,19.27,[],livebench_240701
39
- 37,zephyr_7b_alpha,livebench_lb,19.22,[],livebench_240701
40
- 38,phi_3_mini_128k_instruct,livebench_lb,18.04,[],livebench_240701
41
- 39,zephyr_7b_beta,livebench_lb,17.32,[],livebench_240701
42
- 40,deepseek_v2_lite_chat,livebench_lb,17.14,[],livebench_240701
43
- 41,qwen1.5_7b_chat,livebench_lb,16.5,[],livebench_240701
44
- 42,starling_lm_7b_beta,livebench_lb,16.44,[],livebench_240701
45
- 43,vicuna_7b_v1.5_16k,livebench_lb,13.71,[],livebench_240701
46
- 44,vicuna_7b_v1.5,livebench_lb,11.73,[],livebench_240701
47
- 45,qwen1.5_4b_chat,livebench_lb,11.13,[],livebench_240701
48
- 46,llama_2_7b_chat,livebench_lb,10.25,[],livebench_240701
49
- 47,qwen2_1.5b_instruct,livebench_lb,9.96,[],livebench_240701
50
- 48,yi_6b_chat,livebench_lb,8.79,[],livebench_240701
51
- 49,qwen2_0.5b_instruct,livebench_lb,6.78,[],livebench_240701
52
- 50,qwen1.5_1.8b_chat,livebench_lb,6.09,[],livebench_240701
53
- 51,qwen1.5_0.5b_chat,livebench_lb,5.26,[],livebench_240701
54
- 52,claude_3_5_sonnet_20240620,reasoning_lb,64.0,[],livebench_240701
55
- 53,gpt_4o_2024_05_13,reasoning_lb,55.0,[],livebench_240701
56
- 54,gpt_4_turbo_2024_04_09,reasoning_lb,54.0,[],livebench_240701
57
- 55,gpt_4_1106_preview,reasoning_lb,52.0,[],livebench_240701
58
- 56,claude_3_opus_20240229,reasoning_lb,41.0,[],livebench_240701
59
- 57,gpt_4_0125_preview,reasoning_lb,48.0,[],livebench_240701
60
- 58,deepseek_coder_v2,reasoning_lb,49.0,[],livebench_240701
61
- 59,gemini_1.5_pro_api_0514,reasoning_lb,33.0,[],livebench_240701
62
- 60,gemma_2_27b_it,reasoning_lb,31.0,[],livebench_240701
63
- 61,gemini_1.5_flash_api_0514,reasoning_lb,30.0,[],livebench_240701
64
- 62,qwen2_72b_instruct,reasoning_lb,42.0,[],livebench_240701
65
- 63,acm_rewrite_qwen2_72b_chat,reasoning_lb,37.0,[],livebench_240701
66
- 64,mistral_large_2402,reasoning_lb,35.0,[],livebench_240701
67
- 65,deepseek_chat_v2,reasoning_lb,29.0,[],livebench_240701
68
- 66,claude_3_sonnet_20240229,reasoning_lb,26.0,[],livebench_240701
69
- 67,meta_llama_3_70b_instruct,reasoning_lb,31.0,[],livebench_240701
70
- 68,claude_3_haiku_20240307,reasoning_lb,26.0,[],livebench_240701
71
- 69,mixtral_8x22b_instruct_v0.1,reasoning_lb,29.0,[],livebench_240701
72
- 70,gpt_3.5_turbo_0125,reasoning_lb,26.0,[],livebench_240701
73
- 71,gpt_3.5_turbo_1106,reasoning_lb,28.0,[],livebench_240701
74
- 72,command_r_plus,reasoning_lb,32.0,[],livebench_240701
75
- 73,mistral_small_2402,reasoning_lb,28.0,[],livebench_240701
76
- 74,gemma_2_9b_it,reasoning_lb,19.0,[],livebench_240701
77
- 75,phi_3_medium_4k_instruct,reasoning_lb,35.0,[],livebench_240701
78
- 76,phi_3_medium_128k_instruct,reasoning_lb,31.0,[],livebench_240701
79
- 77,deepseek_coder_v2_lite_instruct,reasoning_lb,22.0,[],livebench_240701
80
- 78,qwen1.5_110b_chat,reasoning_lb,26.0,[],livebench_240701
81
- 79,qwen1.5_72b_chat,reasoning_lb,21.0,[],livebench_240701
82
- 80,command_r,reasoning_lb,28.0,[],livebench_240701
83
- 81,phi_3_small_128k_instruct,reasoning_lb,36.0,[],livebench_240701
84
- 82,meta_llama_3_8b_instruct,reasoning_lb,25.0,[],livebench_240701
85
- 83,qwen2_7b_instruct,reasoning_lb,20.0,[],livebench_240701
86
- 84,phi_3_small_8k_instruct,reasoning_lb,23.0,[],livebench_240701
87
- 85,openhermes_2.5_mistral_7b,reasoning_lb,17.0,[],livebench_240701
88
- 86,mixtral_8x7b_instruct_v0.1,reasoning_lb,18.0,[],livebench_240701
89
- 87,mistral_7b_instruct_v0.2,reasoning_lb,13.0,[],livebench_240701
90
- 88,phi_3_mini_4k_instruct,reasoning_lb,19.0,[],livebench_240701
91
- 89,zephyr_7b_alpha,reasoning_lb,17.0,[],livebench_240701
92
- 90,phi_3_mini_128k_instruct,reasoning_lb,10.0,[],livebench_240701
93
- 91,zephyr_7b_beta,reasoning_lb,16.0,[],livebench_240701
94
- 92,deepseek_v2_lite_chat,reasoning_lb,13.0,[],livebench_240701
95
- 93,qwen1.5_7b_chat,reasoning_lb,13.0,[],livebench_240701
96
- 94,starling_lm_7b_beta,reasoning_lb,19.0,[],livebench_240701
97
- 95,vicuna_7b_v1.5_16k,reasoning_lb,15.0,[],livebench_240701
98
- 96,vicuna_7b_v1.5,reasoning_lb,12.0,[],livebench_240701
99
- 97,qwen1.5_4b_chat,reasoning_lb,13.0,[],livebench_240701
100
- 98,llama_2_7b_chat,reasoning_lb,5.0,[],livebench_240701
101
- 99,qwen2_1.5b_instruct,reasoning_lb,8.0,[],livebench_240701
102
- 100,yi_6b_chat,reasoning_lb,8.0,[],livebench_240701
103
- 101,qwen2_0.5b_instruct,reasoning_lb,3.0,[],livebench_240701
104
- 102,qwen1.5_1.8b_chat,reasoning_lb,5.0,[],livebench_240701
105
- 103,qwen1.5_0.5b_chat,reasoning_lb,4.0,[],livebench_240701
106
- 104,claude_3_5_sonnet_20240620,coding_lb,63.21,[],livebench_240701
107
- 105,gpt_4o_2024_05_13,coding_lb,46.37,[],livebench_240701
108
- 106,gpt_4_turbo_2024_04_09,coding_lb,47.05,[],livebench_240701
109
- 107,gpt_4_1106_preview,coding_lb,44.37,[],livebench_240701
110
- 108,claude_3_opus_20240229,coding_lb,40.05,[],livebench_240701
111
- 109,gpt_4_0125_preview,coding_lb,44.05,[],livebench_240701
112
- 110,deepseek_coder_v2,coding_lb,41.05,[],livebench_240701
113
- 111,gemini_1.5_pro_api_0514,coding_lb,32.79,[],livebench_240701
114
- 112,gemma_2_27b_it,coding_lb,36.74,[],livebench_240701
115
- 113,gemini_1.5_flash_api_0514,coding_lb,39.05,[],livebench_240701
116
- 114,qwen2_72b_instruct,coding_lb,31.79,[],livebench_240701
117
- 115,acm_rewrite_qwen2_72b_chat,coding_lb,39.05,[],livebench_240701
118
- 116,mistral_large_2402,coding_lb,26.84,[],livebench_240701
119
- 117,deepseek_chat_v2,coding_lb,33.47,[],livebench_240701
120
- 118,claude_3_sonnet_20240229,coding_lb,25.21,[],livebench_240701
121
- 119,meta_llama_3_70b_instruct,coding_lb,20.95,[],livebench_240701
122
- 120,claude_3_haiku_20240307,coding_lb,24.53,[],livebench_240701
123
- 121,mixtral_8x22b_instruct_v0.1,coding_lb,33.11,[],livebench_240701
124
- 122,gpt_3.5_turbo_0125,coding_lb,29.16,[],livebench_240701
125
- 123,gpt_3.5_turbo_1106,coding_lb,26.84,[],livebench_240701
126
- 124,command_r_plus,coding_lb,20.26,[],livebench_240701
127
- 125,mistral_small_2402,coding_lb,24.21,[],livebench_240701
128
- 126,gemma_2_9b_it,coding_lb,22.21,[],livebench_240701
129
- 127,phi_3_medium_4k_instruct,coding_lb,20.58,[],livebench_240701
130
- 128,phi_3_medium_128k_instruct,coding_lb,21.58,[],livebench_240701
131
- 129,deepseek_coder_v2_lite_instruct,coding_lb,26.84,[],livebench_240701
132
- 130,qwen1.5_110b_chat,coding_lb,22.21,[],livebench_240701
133
- 131,qwen1.5_72b_chat,coding_lb,22.89,[],livebench_240701
134
- 132,command_r,coding_lb,14.95,[],livebench_240701
135
- 133,phi_3_small_128k_instruct,coding_lb,25.84,[],livebench_240701
136
- 134,meta_llama_3_8b_instruct,coding_lb,18.26,[],livebench_240701
137
- 135,qwen2_7b_instruct,coding_lb,29.21,[],livebench_240701
138
- 136,phi_3_small_8k_instruct,coding_lb,19.58,[],livebench_240701
139
- 137,openhermes_2.5_mistral_7b,coding_lb,11.63,[],livebench_240701
140
- 138,mixtral_8x7b_instruct_v0.1,coding_lb,11.32,[],livebench_240701
141
- 139,mistral_7b_instruct_v0.2,coding_lb,11.63,[],livebench_240701
142
- 140,phi_3_mini_4k_instruct,coding_lb,14.95,[],livebench_240701
143
- 141,zephyr_7b_alpha,coding_lb,11.32,[],livebench_240701
144
- 142,phi_3_mini_128k_instruct,coding_lb,11.63,[],livebench_240701
145
- 143,zephyr_7b_beta,coding_lb,8.32,[],livebench_240701
146
- 144,deepseek_v2_lite_chat,coding_lb,8.63,[],livebench_240701
147
- 145,qwen1.5_7b_chat,coding_lb,6.63,[],livebench_240701
148
- 146,starling_lm_7b_beta,coding_lb,18.26,[],livebench_240701
149
- 147,vicuna_7b_v1.5_16k,coding_lb,1.32,[],livebench_240701
150
- 148,vicuna_7b_v1.5,coding_lb,1.0,[],livebench_240701
151
- 149,qwen1.5_4b_chat,coding_lb,4.0,[],livebench_240701
152
- 150,llama_2_7b_chat,coding_lb,0.0,[],livebench_240701
153
- 151,qwen2_1.5b_instruct,coding_lb,5.63,[],livebench_240701
154
- 152,yi_6b_chat,coding_lb,1.32,[],livebench_240701
155
- 153,qwen2_0.5b_instruct,coding_lb,2.0,[],livebench_240701
156
- 154,qwen1.5_1.8b_chat,coding_lb,0.0,[],livebench_240701
157
- 155,qwen1.5_0.5b_chat,coding_lb,0.0,[],livebench_240701
158
- 156,claude_3_5_sonnet_20240620,mathematics_lb,53.75,[],livebench_240701
159
- 157,gpt_4o_2024_05_13,mathematics_lb,49.88,[],livebench_240701
160
- 158,gpt_4_turbo_2024_04_09,mathematics_lb,48.99,[],livebench_240701
161
- 159,gpt_4_1106_preview,mathematics_lb,47.55,[],livebench_240701
162
- 160,claude_3_opus_20240229,mathematics_lb,46.54,[],livebench_240701
163
- 161,gpt_4_0125_preview,mathematics_lb,42.75,[],livebench_240701
164
- 162,deepseek_coder_v2,mathematics_lb,52.19,[],livebench_240701
165
- 163,gemini_1.5_pro_api_0514,mathematics_lb,42.07,[],livebench_240701
166
- 164,gemma_2_27b_it,mathematics_lb,36.23,[],livebench_240701
167
- 165,gemini_1.5_flash_api_0514,mathematics_lb,38.54,[],livebench_240701
168
- 166,qwen2_72b_instruct,mathematics_lb,43.44,[],livebench_240701
169
- 167,acm_rewrite_qwen2_72b_chat,mathematics_lb,40.32,[],livebench_240701
170
- 168,mistral_large_2402,mathematics_lb,32.2,[],livebench_240701
171
- 169,deepseek_chat_v2,mathematics_lb,33.23,[],livebench_240701
172
- 170,claude_3_sonnet_20240229,mathematics_lb,29.65,[],livebench_240701
173
- 171,meta_llama_3_70b_instruct,mathematics_lb,32.31,[],livebench_240701
174
- 172,claude_3_haiku_20240307,mathematics_lb,25.72,[],livebench_240701
175
- 173,mixtral_8x22b_instruct_v0.1,mathematics_lb,26.94,[],livebench_240701
176
- 174,gpt_3.5_turbo_0125,mathematics_lb,25.54,[],livebench_240701
177
- 175,gpt_3.5_turbo_1106,mathematics_lb,28.13,[],livebench_240701
178
- 176,command_r_plus,mathematics_lb,24.85,[],livebench_240701
179
- 177,mistral_small_2402,mathematics_lb,26.76,[],livebench_240701
180
- 178,gemma_2_9b_it,mathematics_lb,23.98,[],livebench_240701
181
- 179,phi_3_medium_4k_instruct,mathematics_lb,27.54,[],livebench_240701
182
- 180,phi_3_medium_128k_instruct,mathematics_lb,24.25,[],livebench_240701
183
- 181,deepseek_coder_v2_lite_instruct,mathematics_lb,34.09,[],livebench_240701
184
- 182,qwen1.5_110b_chat,mathematics_lb,25.58,[],livebench_240701
185
- 183,qwen1.5_72b_chat,mathematics_lb,26.82,[],livebench_240701
186
- 184,command_r,mathematics_lb,16.92,[],livebench_240701
187
- 185,phi_3_small_128k_instruct,mathematics_lb,24.84,[],livebench_240701
188
- 186,meta_llama_3_8b_instruct,mathematics_lb,17.58,[],livebench_240701
189
- 187,qwen2_7b_instruct,mathematics_lb,25.83,[],livebench_240701
190
- 188,phi_3_small_8k_instruct,mathematics_lb,24.15,[],livebench_240701
191
- 189,openhermes_2.5_mistral_7b,mathematics_lb,20.1,[],livebench_240701
192
- 190,mixtral_8x7b_instruct_v0.1,mathematics_lb,18.97,[],livebench_240701
193
- 191,mistral_7b_instruct_v0.2,mathematics_lb,16.04,[],livebench_240701
194
- 192,phi_3_mini_4k_instruct,mathematics_lb,19.88,[],livebench_240701
195
- 193,zephyr_7b_alpha,mathematics_lb,9.61,[],livebench_240701
196
- 194,phi_3_mini_128k_instruct,mathematics_lb,21.48,[],livebench_240701
197
- 195,zephyr_7b_beta,mathematics_lb,11.23,[],livebench_240701
198
- 196,deepseek_v2_lite_chat,mathematics_lb,11.99,[],livebench_240701
199
- 197,qwen1.5_7b_chat,mathematics_lb,12.86,[],livebench_240701
200
- 198,starling_lm_7b_beta,mathematics_lb,13.82,[],livebench_240701
201
- 199,vicuna_7b_v1.5_16k,mathematics_lb,6.61,[],livebench_240701
202
- 200,vicuna_7b_v1.5,mathematics_lb,4.33,[],livebench_240701
203
- 201,qwen1.5_4b_chat,mathematics_lb,7.08,[],livebench_240701
204
- 202,llama_2_7b_chat,mathematics_lb,4.78,[],livebench_240701
205
- 203,qwen2_1.5b_instruct,mathematics_lb,7.16,[],livebench_240701
206
- 204,yi_6b_chat,mathematics_lb,7.14,[],livebench_240701
207
- 205,qwen2_0.5b_instruct,mathematics_lb,4.22,[],livebench_240701
208
- 206,qwen1.5_1.8b_chat,mathematics_lb,2.14,[],livebench_240701
209
- 207,qwen1.5_0.5b_chat,mathematics_lb,3.39,[],livebench_240701
210
- 208,claude_3_5_sonnet_20240620,data_analysis_lb,56.74,[],livebench_240701
211
- 209,gpt_4o_2024_05_13,data_analysis_lb,52.41,[],livebench_240701
212
- 210,gpt_4_turbo_2024_04_09,data_analysis_lb,51.32,[],livebench_240701
213
- 211,gpt_4_1106_preview,data_analysis_lb,51.33,[],livebench_240701
214
- 212,claude_3_opus_20240229,data_analysis_lb,54.32,[],livebench_240701
215
- 213,gpt_4_0125_preview,data_analysis_lb,54.06,[],livebench_240701
216
- 214,deepseek_coder_v2,data_analysis_lb,38.25,[],livebench_240701
217
- 215,gemini_1.5_pro_api_0514,data_analysis_lb,52.81,[],livebench_240701
218
- 216,gemma_2_27b_it,data_analysis_lb,43.58,[],livebench_240701
219
- 217,gemini_1.5_flash_api_0514,data_analysis_lb,44.03,[],livebench_240701
220
- 218,qwen2_72b_instruct,data_analysis_lb,26.24,[],livebench_240701
221
- 219,acm_rewrite_qwen2_72b_chat,data_analysis_lb,26.19,[],livebench_240701
222
- 220,mistral_large_2402,data_analysis_lb,42.55,[],livebench_240701
223
- 221,deepseek_chat_v2,data_analysis_lb,38.03,[],livebench_240701
224
- 222,claude_3_sonnet_20240229,data_analysis_lb,44.56,[],livebench_240701
225
- 223,meta_llama_3_70b_instruct,data_analysis_lb,42.41,[],livebench_240701
226
- 224,claude_3_haiku_20240307,data_analysis_lb,41.54,[],livebench_240701
227
- 225,mixtral_8x22b_instruct_v0.1,data_analysis_lb,30.33,[],livebench_240701
228
- 226,gpt_3.5_turbo_0125,data_analysis_lb,41.21,[],livebench_240701
229
- 227,gpt_3.5_turbo_1106,data_analysis_lb,41.7,[],livebench_240701
230
- 228,command_r_plus,data_analysis_lb,24.6,[],livebench_240701
231
- 229,mistral_small_2402,data_analysis_lb,31.88,[],livebench_240701
232
- 230,gemma_2_9b_it,data_analysis_lb,35.06,[],livebench_240701
233
- 231,phi_3_medium_4k_instruct,data_analysis_lb,31.63,[],livebench_240701
234
- 232,phi_3_medium_128k_instruct,data_analysis_lb,32.12,[],livebench_240701
235
- 233,deepseek_coder_v2_lite_instruct,data_analysis_lb,33.0,[],livebench_240701
236
- 234,qwen1.5_110b_chat,data_analysis_lb,31.45,[],livebench_240701
237
- 235,qwen1.5_72b_chat,data_analysis_lb,32.98,[],livebench_240701
238
- 236,command_r,data_analysis_lb,31.69,[],livebench_240701
239
- 237,phi_3_small_128k_instruct,data_analysis_lb,27.33,[],livebench_240701
240
- 238,meta_llama_3_8b_instruct,data_analysis_lb,23.33,[],livebench_240701
241
- 239,qwen2_7b_instruct,data_analysis_lb,28.75,[],livebench_240701
242
- 240,phi_3_small_8k_instruct,data_analysis_lb,27.5,[],livebench_240701
243
- 241,openhermes_2.5_mistral_7b,data_analysis_lb,26.92,[],livebench_240701
244
- 242,mixtral_8x7b_instruct_v0.1,data_analysis_lb,28.13,[],livebench_240701
245
- 243,mistral_7b_instruct_v0.2,data_analysis_lb,14.62,[],livebench_240701
246
- 244,phi_3_mini_4k_instruct,data_analysis_lb,14.67,[],livebench_240701
247
- 245,zephyr_7b_alpha,data_analysis_lb,17.4,[],livebench_240701
248
- 246,phi_3_mini_128k_instruct,data_analysis_lb,8.69,[],livebench_240701
249
- 247,zephyr_7b_beta,data_analysis_lb,15.75,[],livebench_240701
250
- 248,deepseek_v2_lite_chat,data_analysis_lb,18.19,[],livebench_240701
251
- 249,qwen1.5_7b_chat,data_analysis_lb,16.23,[],livebench_240701
252
- 250,starling_lm_7b_beta,data_analysis_lb,2.0,[],livebench_240701
253
- 251,vicuna_7b_v1.5_16k,data_analysis_lb,9.27,[],livebench_240701
254
- 252,vicuna_7b_v1.5,data_analysis_lb,2.67,[],livebench_240701
255
- 253,qwen1.5_4b_chat,data_analysis_lb,9.13,[],livebench_240701
256
- 254,llama_2_7b_chat,data_analysis_lb,0.0,[],livebench_240701
257
- 255,qwen2_1.5b_instruct,data_analysis_lb,10.01,[],livebench_240701
258
- 256,yi_6b_chat,data_analysis_lb,4.38,[],livebench_240701
259
- 257,qwen2_0.5b_instruct,data_analysis_lb,2.0,[],livebench_240701
260
- 258,qwen1.5_1.8b_chat,data_analysis_lb,3.33,[],livebench_240701
261
- 259,qwen1.5_0.5b_chat,data_analysis_lb,0.0,[],livebench_240701
262
- 260,claude_3_5_sonnet_20240620,language_lb,56.94,[],livebench_240701
263
- 261,gpt_4o_2024_05_13,language_lb,53.94,[],livebench_240701
264
- 262,gpt_4_turbo_2024_04_09,language_lb,45.26,[],livebench_240701
265
- 263,gpt_4_1106_preview,language_lb,48.37,[],livebench_240701
266
- 264,claude_3_opus_20240229,language_lb,51.72,[],livebench_240701
267
- 265,gpt_4_0125_preview,language_lb,43.55,[],livebench_240701
268
- 266,deepseek_coder_v2,language_lb,33.04,[],livebench_240701
269
- 267,gemini_1.5_pro_api_0514,language_lb,38.25,[],livebench_240701
270
- 268,gemma_2_27b_it,language_lb,32.4,[],livebench_240701
271
- 269,gemini_1.5_flash_api_0514,language_lb,30.69,[],livebench_240701
272
- 270,qwen2_72b_instruct,language_lb,29.21,[],livebench_240701
273
- 271,acm_rewrite_qwen2_72b_chat,language_lb,30.03,[],livebench_240701
274
- 272,mistral_large_2402,language_lb,28.74,[],livebench_240701
275
- 273,deepseek_chat_v2,language_lb,32.29,[],livebench_240701
276
- 274,claude_3_sonnet_20240229,language_lb,38.08,[],livebench_240701
277
- 275,meta_llama_3_70b_instruct,language_lb,34.11,[],livebench_240701
278
- 276,claude_3_haiku_20240307,language_lb,30.07,[],livebench_240701
279
- 277,mixtral_8x22b_instruct_v0.1,language_lb,26.48,[],livebench_240701
280
- 278,gpt_3.5_turbo_0125,language_lb,24.22,[],livebench_240701
281
- 279,gpt_3.5_turbo_1106,language_lb,28.63,[],livebench_240701
282
- 280,command_r_plus,language_lb,23.92,[],livebench_240701
283
- 281,mistral_small_2402,language_lb,22.06,[],livebench_240701
284
- 282,gemma_2_9b_it,language_lb,27.64,[],livebench_240701
285
- 283,phi_3_medium_4k_instruct,language_lb,13.91,[],livebench_240701
286
- 284,phi_3_medium_128k_instruct,language_lb,12.76,[],livebench_240701
287
- 285,deepseek_coder_v2_lite_instruct,language_lb,10.64,[],livebench_240701
288
- 286,qwen1.5_110b_chat,language_lb,13.22,[],livebench_240701
289
- 287,qwen1.5_72b_chat,language_lb,11.37,[],livebench_240701
290
- 288,command_r,language_lb,14.64,[],livebench_240701
291
- 289,phi_3_small_128k_instruct,language_lb,12.28,[],livebench_240701
292
- 290,meta_llama_3_8b_instruct,language_lb,18.72,[],livebench_240701
293
- 291,qwen2_7b_instruct,language_lb,10.21,[],livebench_240701
294
- 292,phi_3_small_8k_instruct,language_lb,14.96,[],livebench_240701
295
- 293,openhermes_2.5_mistral_7b,language_lb,11.37,[],livebench_240701
296
- 294,mixtral_8x7b_instruct_v0.1,language_lb,13.76,[],livebench_240701
297
- 295,mistral_7b_instruct_v0.2,language_lb,9.05,[],livebench_240701
298
- 296,phi_3_mini_4k_instruct,language_lb,7.1,[],livebench_240701
299
- 297,zephyr_7b_alpha,language_lb,7.2,[],livebench_240701
300
- 298,phi_3_mini_128k_instruct,language_lb,6.8,[],livebench_240701
301
- 299,zephyr_7b_beta,language_lb,4.28,[],livebench_240701
302
- 300,deepseek_v2_lite_chat,language_lb,9.2,[],livebench_240701
303
- 301,qwen1.5_7b_chat,language_lb,6.18,[],livebench_240701
304
- 302,starling_lm_7b_beta,language_lb,7.26,[],livebench_240701
305
- 303,vicuna_7b_v1.5_16k,language_lb,7.92,[],livebench_240701
306
- 304,vicuna_7b_v1.5,language_lb,8.66,[],livebench_240701
307
- 305,qwen1.5_4b_chat,language_lb,5.8,[],livebench_240701
308
- 306,llama_2_7b_chat,language_lb,6.86,[],livebench_240701
309
- 307,qwen2_1.5b_instruct,language_lb,3.05,[],livebench_240701
310
- 308,yi_6b_chat,language_lb,4.69,[],livebench_240701
311
- 309,qwen2_0.5b_instruct,language_lb,2.8,[],livebench_240701
312
- 310,qwen1.5_1.8b_chat,language_lb,3.16,[],livebench_240701
313
- 311,qwen1.5_0.5b_chat,language_lb,2.88,[],livebench_240701
314
- 312,claude_3_5_sonnet_20240620,if_lb,72.3,[],livebench_240701
315
- 313,gpt_4o_2024_05_13,if_lb,72.17,[],livebench_240701
316
- 314,gpt_4_turbo_2024_04_09,if_lb,71.39,[],livebench_240701
317
- 315,gpt_4_1106_preview,if_lb,69.39,[],livebench_240701
318
- 316,claude_3_opus_20240229,if_lb,70.87,[],livebench_240701
319
- 317,gpt_4_0125_preview,if_lb,63.92,[],livebench_240701
320
- 318,deepseek_coder_v2,if_lb,67.18,[],livebench_240701
321
- 319,gemini_1.5_pro_api_0514,if_lb,67.2,[],livebench_240701
322
- 320,gemma_2_27b_it,if_lb,67.37,[],livebench_240701
323
- 321,gemini_1.5_flash_api_0514,if_lb,63.01,[],livebench_240701
324
- 322,qwen2_72b_instruct,if_lb,68.27,[],livebench_240701
325
- 323,acm_rewrite_qwen2_72b_chat,if_lb,65.0,[],livebench_240701
326
- 324,mistral_large_2402,if_lb,68.19,[],livebench_240701
327
- 325,deepseek_chat_v2,if_lb,64.34,[],livebench_240701
328
- 326,claude_3_sonnet_20240229,if_lb,65.0,[],livebench_240701
329
- 327,meta_llama_3_70b_instruct,if_lb,63.5,[],livebench_240701
330
- 328,claude_3_haiku_20240307,if_lb,64.03,[],livebench_240701
331
- 329,mixtral_8x22b_instruct_v0.1,if_lb,63.17,[],livebench_240701
332
- 330,gpt_3.5_turbo_0125,if_lb,60.47,[],livebench_240701
333
- 331,gpt_3.5_turbo_1106,if_lb,51.53,[],livebench_240701
334
- 332,command_r_plus,if_lb,71.51,[],livebench_240701
335
- 333,mistral_small_2402,if_lb,63.91,[],livebench_240701
336
- 334,gemma_2_9b_it,if_lb,61.55,[],livebench_240701
337
- 335,phi_3_medium_4k_instruct,if_lb,53.3,[],livebench_240701
338
- 336,phi_3_medium_128k_instruct,if_lb,56.15,[],livebench_240701
339
- 337,deepseek_coder_v2_lite_instruct,if_lb,48.34,[],livebench_240701
340
- 338,qwen1.5_110b_chat,if_lb,55.26,[],livebench_240701
341
- 339,qwen1.5_72b_chat,if_lb,58.25,[],livebench_240701
342
- 340,command_r,if_lb,57.16,[],livebench_240701
343
- 341,phi_3_small_128k_instruct,if_lb,36.88,[],livebench_240701
344
- 342,meta_llama_3_8b_instruct,if_lb,57.14,[],livebench_240701
345
- 343,qwen2_7b_instruct,if_lb,44.74,[],livebench_240701
346
- 344,phi_3_small_8k_instruct,if_lb,48.24,[],livebench_240701
347
- 345,openhermes_2.5_mistral_7b,if_lb,52.78,[],livebench_240701
348
- 346,mixtral_8x7b_instruct_v0.1,if_lb,44.81,[],livebench_240701
349
- 347,mistral_7b_instruct_v0.2,if_lb,51.65,[],livebench_240701
350
- 348,phi_3_mini_4k_instruct,if_lb,40.05,[],livebench_240701
351
- 349,zephyr_7b_alpha,if_lb,52.79,[],livebench_240701
352
- 350,phi_3_mini_128k_instruct,if_lb,49.65,[],livebench_240701
353
- 351,zephyr_7b_beta,if_lb,48.32,[],livebench_240701
354
- 352,deepseek_v2_lite_chat,if_lb,41.83,[],livebench_240701
355
- 353,qwen1.5_7b_chat,if_lb,44.12,[],livebench_240701
356
- 354,starling_lm_7b_beta,if_lb,38.32,[],livebench_240701
357
- 355,vicuna_7b_v1.5_16k,if_lb,42.12,[],livebench_240701
358
- 356,vicuna_7b_v1.5,if_lb,41.75,[],livebench_240701
359
- 357,qwen1.5_4b_chat,if_lb,27.75,[],livebench_240701
360
- 358,llama_2_7b_chat,if_lb,44.88,[],livebench_240701
361
- 359,qwen2_1.5b_instruct,if_lb,25.9,[],livebench_240701
362
- 360,yi_6b_chat,if_lb,27.22,[],livebench_240701
363
- 361,qwen2_0.5b_instruct,if_lb,26.63,[],livebench_240701
364
- 362,qwen1.5_1.8b_chat,if_lb,22.9,[],livebench_240701
365
- 363,qwen1.5_0.5b_chat,if_lb,21.3,[],livebench_240701
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
assets/pointplot_granularity_matters.png ADDED
cache/agreements_cache_42471fdfe00c7ff9b0aba18b66ab5a5f.csv ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value
2
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
3
+ humaneval,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425
4
+ mbpp,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.10540925533894598,0.8005421074231263
5
+ winogrande,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,-0.19999999999999998,0.8166666666666667
6
+ grounding,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.9486832980505137,0.02297740150320607
7
+ instruction_following,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.9486832980505137,0.02297740150320607
8
+ planning,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
9
+ reasoning,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
10
+ refinement,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
11
+ safety,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334
12
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
13
+ tool_usage,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
14
+ livebench_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334
15
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.7378647873726218,0.07697417298126676
16
+ coding_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425
17
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334
18
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
19
+ language_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
20
+ if_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
21
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334
22
+ mixeval,mixeval_240601,aggregate,holistic,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
23
+ agieval,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.7378647873726218,0.07697417298126676
24
+ arc_c,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334
25
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.7378647873726218,0.07697417298126676
26
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
27
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425
28
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
29
+ bbh,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.31622776601683794,0.44848886103153174
30
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
31
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
32
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
33
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334
34
+ magi,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
35
+ mmlu,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334
36
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334
37
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334
38
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
39
+ aggregate,holistic,humaneval,BLZ_240312,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425
40
+ aggregate,holistic,mbpp,BLZ_240312,kendall,top_aggregate,5,0,0.10540925533894598,0.8005421074231263
41
+ aggregate,holistic,winogrande,BLZ_240312,kendall,top_aggregate,5,0,-0.19999999999999998,0.8166666666666667
42
+ aggregate,holistic,grounding,biggen_240612,kendall,top_aggregate,5,0,0.9486832980505137,0.02297740150320607
43
+ aggregate,holistic,instruction_following,biggen_240612,kendall,top_aggregate,5,0,0.9486832980505137,0.02297740150320607
44
+ aggregate,holistic,planning,biggen_240612,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
45
+ aggregate,holistic,reasoning,biggen_240612,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
46
+ aggregate,holistic,refinement,biggen_240612,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
47
+ aggregate,holistic,safety,biggen_240612,kendall,top_aggregate,5,0,0.6,0.23333333333333334
48
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
49
+ aggregate,holistic,tool_usage,biggen_240612,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
50
+ aggregate,holistic,livebench_average,livebench_240701,kendall,top_aggregate,5,0,0.6,0.23333333333333334
51
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,top_aggregate,5,0,0.7378647873726218,0.07697417298126676
52
+ aggregate,holistic,coding_average,livebench_240701,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425
53
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,top_aggregate,5,0,0.6,0.23333333333333334
54
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
55
+ aggregate,holistic,language_average,livebench_240701,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
56
+ aggregate,holistic,if_average,livebench_240701,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
57
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,top_aggregate,5,0,0.6,0.23333333333333334
58
+ aggregate,holistic,mixeval,mixeval_240601,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
59
+ aggregate,holistic,agieval,BLZ_240312,kendall,top_aggregate,5,0,0.7378647873726218,0.07697417298126676
60
+ aggregate,holistic,arc_c,BLZ_240312,kendall,top_aggregate,5,0,0.6,0.23333333333333334
61
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,top_aggregate,5,0,0.7378647873726218,0.07697417298126676
62
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
63
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425
64
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
65
+ aggregate,holistic,bbh,BLZ_240312,kendall,top_aggregate,5,0,0.31622776601683794,0.44848886103153174
66
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
67
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
68
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
69
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,top_aggregate,5,0,0.6,0.23333333333333334
70
+ aggregate,holistic,magi,BLZ_240312,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
71
+ aggregate,holistic,mmlu,BLZ_240312,kendall,top_aggregate,5,0,0.6,0.23333333333333334
72
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,top_aggregate,5,0,0.6,0.23333333333333334
73
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,top_aggregate,5,0,0.6,0.23333333333333334
cache/agreements_cache_6ac32881b7d0a3bf6d8762ff242ff449.csv ADDED
@@ -0,0 +1,721 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value
2
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.39999999999999997,0.48333333333333334
3
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
4
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
5
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
6
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.6,0.23333333333333334
7
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.39999999999999997,0.48333333333333334
8
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.39999999999999997,0.48333333333333334
9
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.0,1.0
10
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.19999999999999998,0.8166666666666667
11
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
12
+ humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.19999999999999998,0.8166666666666667
13
+ humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
14
+ humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
15
+ humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.31622776601683794,0.44848886103153174
16
+ humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
17
+ humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.39999999999999997,0.48333333333333334
18
+ humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.39999999999999997,0.48333333333333334
19
+ humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.9999999999999999,0.016666666666666666
20
+ humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.39999999999999997,0.48333333333333334
21
+ humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.39999999999999997,0.48333333333333334
22
+ mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.9999999999999999,0.016666666666666666
23
+ mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.5270462766947298,0.206507295485425
24
+ mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
25
+ mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
26
+ mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
27
+ mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.5270462766947298,0.206507295485425
28
+ mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
29
+ mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.39999999999999997,0.48333333333333334
30
+ mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.39999999999999997,0.48333333333333334
31
+ mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,9,-0.10540925533894598,0.8005421074231263
32
+ winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,0,-0.10540925533894596,0.8005421074231263
33
+ winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
34
+ winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
35
+ winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
36
+ winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.19999999999999998,0.8166666666666667
37
+ winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
38
+ winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.39999999999999997,0.48333333333333334
39
+ winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.0,1.0
40
+ winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.0,1.0
41
+ winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
42
+ grounding,biggen_240612,aggregate,holistic,kendall,random,5,0,0.39999999999999997,0.48333333333333334
43
+ grounding,biggen_240612,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
44
+ grounding,biggen_240612,aggregate,holistic,kendall,random,5,2,0.39999999999999997,0.48333333333333334
45
+ grounding,biggen_240612,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
46
+ grounding,biggen_240612,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
47
+ grounding,biggen_240612,aggregate,holistic,kendall,random,5,5,0.9486832980505137,0.02297740150320607
48
+ grounding,biggen_240612,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
49
+ grounding,biggen_240612,aggregate,holistic,kendall,random,5,7,0.19999999999999998,0.8166666666666667
50
+ grounding,biggen_240612,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
51
+ grounding,biggen_240612,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
52
+ instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,0,0.19999999999999998,0.8166666666666667
53
+ instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
54
+ instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,2,0.0,1.0
55
+ instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
56
+ instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
57
+ instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,5,0.9486832980505137,0.02297740150320607
58
+ instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
59
+ instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,7,-0.10540925533894598,0.8005421074231263
60
+ instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,8,0.9486832980505137,0.02297740150320607
61
+ instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
62
+ planning,biggen_240612,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
63
+ planning,biggen_240612,aggregate,holistic,kendall,random,5,1,0.6,0.23333333333333334
64
+ planning,biggen_240612,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
65
+ planning,biggen_240612,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
66
+ planning,biggen_240612,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
67
+ planning,biggen_240612,aggregate,holistic,kendall,random,5,5,0.39999999999999997,0.48333333333333334
68
+ planning,biggen_240612,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
69
+ planning,biggen_240612,aggregate,holistic,kendall,random,5,7,0.19999999999999998,0.8166666666666667
70
+ planning,biggen_240612,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
71
+ planning,biggen_240612,aggregate,holistic,kendall,random,5,9,0.6,0.23333333333333334
72
+ reasoning,biggen_240612,aggregate,holistic,kendall,random,5,0,0.39999999999999997,0.48333333333333334
73
+ reasoning,biggen_240612,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
74
+ reasoning,biggen_240612,aggregate,holistic,kendall,random,5,2,0.7378647873726218,0.07697417298126676
75
+ reasoning,biggen_240612,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
76
+ reasoning,biggen_240612,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
77
+ reasoning,biggen_240612,aggregate,holistic,kendall,random,5,5,0.9999999999999999,0.016666666666666666
78
+ reasoning,biggen_240612,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
79
+ reasoning,biggen_240612,aggregate,holistic,kendall,random,5,7,0.39999999999999997,0.48333333333333334
80
+ reasoning,biggen_240612,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
81
+ reasoning,biggen_240612,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
82
+ refinement,biggen_240612,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
83
+ refinement,biggen_240612,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
84
+ refinement,biggen_240612,aggregate,holistic,kendall,random,5,2,0.19999999999999998,0.8166666666666667
85
+ refinement,biggen_240612,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
86
+ refinement,biggen_240612,aggregate,holistic,kendall,random,5,4,0.6,0.23333333333333334
87
+ refinement,biggen_240612,aggregate,holistic,kendall,random,5,5,0.39999999999999997,0.48333333333333334
88
+ refinement,biggen_240612,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
89
+ refinement,biggen_240612,aggregate,holistic,kendall,random,5,7,-0.19999999999999998,0.8166666666666667
90
+ refinement,biggen_240612,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
91
+ refinement,biggen_240612,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
92
+ safety,biggen_240612,aggregate,holistic,kendall,random,5,0,0.39999999999999997,0.48333333333333334
93
+ safety,biggen_240612,aggregate,holistic,kendall,random,5,1,0.19999999999999998,0.8166666666666667
94
+ safety,biggen_240612,aggregate,holistic,kendall,random,5,2,-0.6,0.23333333333333334
95
+ safety,biggen_240612,aggregate,holistic,kendall,random,5,3,0.0,1.0
96
+ safety,biggen_240612,aggregate,holistic,kendall,random,5,4,0.39999999999999997,0.48333333333333334
97
+ safety,biggen_240612,aggregate,holistic,kendall,random,5,5,0.0,1.0
98
+ safety,biggen_240612,aggregate,holistic,kendall,random,5,6,0.39999999999999997,0.48333333333333334
99
+ safety,biggen_240612,aggregate,holistic,kendall,random,5,7,-0.6,0.23333333333333334
100
+ safety,biggen_240612,aggregate,holistic,kendall,random,5,8,-0.19999999999999998,0.8166666666666667
101
+ safety,biggen_240612,aggregate,holistic,kendall,random,5,9,0.39999999999999997,0.48333333333333334
102
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
103
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
104
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,2,0.19999999999999998,0.8166666666666667
105
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
106
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
107
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
108
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
109
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,7,-0.19999999999999998,0.8166666666666667
110
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
111
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
112
+ tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
113
+ tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
114
+ tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,2,0.9999999999999999,0.016666666666666666
115
+ tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
116
+ tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,4,0.6,0.23333333333333334
117
+ tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
118
+ tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
119
+ tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,7,0.0,1.0
120
+ tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
121
+ tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
122
+ livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,0,0.39999999999999997,0.48333333333333334
123
+ livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
124
+ livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
125
+ livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,3,0.39999999999999997,0.48333333333333334
126
+ livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,4,0.6,0.23333333333333334
127
+ livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
128
+ livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
129
+ livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
130
+ livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,8,0.6,0.23333333333333334
131
+ livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
132
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
133
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,1,0.7378647873726218,0.07697417298126676
134
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
135
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,3,0.5270462766947298,0.206507295485425
136
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,4,0.7378647873726218,0.07697417298126676
137
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,5,0.7378647873726218,0.07697417298126676
138
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,6,0.7378647873726218,0.07697417298126676
139
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
140
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,8,0.7378647873726218,0.07697417298126676
141
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,9,0.7378647873726218,0.07697417298126676
142
+ coding_average,livebench_240701,aggregate,holistic,kendall,random,5,0,0.39999999999999997,0.48333333333333334
143
+ coding_average,livebench_240701,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
144
+ coding_average,livebench_240701,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
145
+ coding_average,livebench_240701,aggregate,holistic,kendall,random,5,3,0.31622776601683794,0.44848886103153174
146
+ coding_average,livebench_240701,aggregate,holistic,kendall,random,5,4,0.5270462766947298,0.206507295485425
147
+ coding_average,livebench_240701,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
148
+ coding_average,livebench_240701,aggregate,holistic,kendall,random,5,6,0.5270462766947298,0.206507295485425
149
+ coding_average,livebench_240701,aggregate,holistic,kendall,random,5,7,0.7378647873726218,0.07697417298126676
150
+ coding_average,livebench_240701,aggregate,holistic,kendall,random,5,8,0.5270462766947298,0.206507295485425
151
+ coding_average,livebench_240701,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
152
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,0,0.39999999999999997,0.48333333333333334
153
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
154
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
155
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,3,0.39999999999999997,0.48333333333333334
156
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
157
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,5,0.9999999999999999,0.016666666666666666
158
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
159
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
160
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
161
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
162
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
163
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
164
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
165
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
166
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
167
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
168
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
169
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,7,0.9999999999999999,0.016666666666666666
170
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
171
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
172
+ language_average,livebench_240701,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
173
+ language_average,livebench_240701,aggregate,holistic,kendall,random,5,1,0.6,0.23333333333333334
174
+ language_average,livebench_240701,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
175
+ language_average,livebench_240701,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
176
+ language_average,livebench_240701,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
177
+ language_average,livebench_240701,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
178
+ language_average,livebench_240701,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
179
+ language_average,livebench_240701,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
180
+ language_average,livebench_240701,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
181
+ language_average,livebench_240701,aggregate,holistic,kendall,random,5,9,0.6,0.23333333333333334
182
+ if_average,livebench_240701,aggregate,holistic,kendall,random,5,0,0.39999999999999997,0.48333333333333334
183
+ if_average,livebench_240701,aggregate,holistic,kendall,random,5,1,0.6,0.23333333333333334
184
+ if_average,livebench_240701,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
185
+ if_average,livebench_240701,aggregate,holistic,kendall,random,5,3,0.0,1.0
186
+ if_average,livebench_240701,aggregate,holistic,kendall,random,5,4,0.0,1.0
187
+ if_average,livebench_240701,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
188
+ if_average,livebench_240701,aggregate,holistic,kendall,random,5,6,0.19999999999999998,0.8166666666666667
189
+ if_average,livebench_240701,aggregate,holistic,kendall,random,5,7,0.39999999999999997,0.48333333333333334
190
+ if_average,livebench_240701,aggregate,holistic,kendall,random,5,8,0.0,1.0
191
+ if_average,livebench_240701,aggregate,holistic,kendall,random,5,9,0.6,0.23333333333333334
192
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,0,0.9999999999999999,0.016666666666666666
193
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,1,0.39999999999999997,0.48333333333333334
194
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
195
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
196
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
197
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
198
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
199
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
200
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
201
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,9,0.9486832980505138,0.02297740150320607
202
+ mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
203
+ mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
204
+ mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
205
+ mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
206
+ mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
207
+ mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,5,0.9999999999999999,0.016666666666666666
208
+ mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
209
+ mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,7,0.9999999999999999,0.016666666666666666
210
+ mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
211
+ mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
212
+ agieval,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.5270462766947299,0.206507295485425
213
+ agieval,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
214
+ agieval,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
215
+ agieval,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
216
+ agieval,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
217
+ agieval,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
218
+ agieval,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
219
+ agieval,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.6,0.23333333333333334
220
+ agieval,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.19999999999999998,0.8166666666666667
221
+ agieval,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.6,0.23333333333333334
222
+ arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.5270462766947298,0.206507295485425
223
+ arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
224
+ arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.9999999999999999,0.016666666666666666
225
+ arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
226
+ arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
227
+ arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
228
+ arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
229
+ arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.9999999999999999,0.016666666666666666
230
+ arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.19999999999999998,0.8166666666666667
231
+ arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
232
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.19999999999999998,0.8166666666666667
233
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
234
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
235
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
236
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
237
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.39999999999999997,0.48333333333333334
238
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.19999999999999998,0.8166666666666667
239
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.39999999999999997,0.48333333333333334
240
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
241
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
242
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
243
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
244
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
245
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
246
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
247
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.9999999999999999,0.016666666666666666
248
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
249
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.9999999999999999,0.016666666666666666
250
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
251
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
252
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.5270462766947299,0.206507295485425
253
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.39999999999999997,0.48333333333333334
254
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
255
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
256
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
257
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
258
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
259
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.6,0.23333333333333334
260
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.19999999999999998,0.8166666666666667
261
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
262
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.9999999999999999,0.016666666666666666
263
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
264
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,2,1.0,0.019176729141549043
265
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
266
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
267
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.9999999999999999,0.016666666666666666
268
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
269
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.9999999999999999,0.016666666666666666
270
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
271
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
272
+ bbh,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.39999999999999997,0.48333333333333334
273
+ bbh,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
274
+ bbh,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
275
+ bbh,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
276
+ bbh,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
277
+ bbh,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
278
+ bbh,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.9486832980505137,0.02297740150320607
279
+ bbh,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.19999999999999998,0.8166666666666667
280
+ bbh,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.0,1.0
281
+ bbh,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
282
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
283
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.6,0.23333333333333334
284
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
285
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
286
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
287
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
288
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
289
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.5270462766947299,0.206507295485425
290
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
291
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.6,0.23333333333333334
292
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
293
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.0,1.0
294
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
295
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
296
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.6,0.23333333333333334
297
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.7378647873726218,0.07697417298126676
298
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
299
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.6,0.23333333333333334
300
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
301
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.39999999999999997,0.48333333333333334
302
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
303
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
304
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.19999999999999998,0.8166666666666667
305
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
306
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.6,0.23333333333333334
307
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.39999999999999997,0.48333333333333334
308
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.39999999999999997,0.48333333333333334
309
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.6,0.23333333333333334
310
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.0,1.0
311
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.39999999999999997,0.48333333333333334
312
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.8366600265340756,0.05220363534131463
313
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
314
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
315
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
316
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.6,0.23333333333333334
317
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.9999999999999999,0.016666666666666666
318
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.7378647873726218,0.07697417298126676
319
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.5270462766947298,0.206507295485425
320
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.3333333333333333,0.4349833603383296
321
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.39999999999999997,0.48333333333333334
322
+ magi,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
323
+ magi,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.5270462766947299,0.206507295485425
324
+ magi,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
325
+ magi,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
326
+ magi,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
327
+ magi,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.9999999999999999,0.016666666666666666
328
+ magi,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.10540925533894596,0.8005421074231263
329
+ magi,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.39999999999999997,0.48333333333333334
330
+ magi,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.31622776601683794,0.44848886103153174
331
+ magi,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
332
+ mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.39999999999999997,0.48333333333333334
333
+ mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
334
+ mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.19999999999999998,0.8166666666666667
335
+ mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
336
+ mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.39999999999999997,0.48333333333333334
337
+ mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
338
+ mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
339
+ mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.9999999999999999,0.016666666666666666
340
+ mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.6,0.23333333333333334
341
+ mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
342
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
343
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
344
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
345
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
346
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
347
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
348
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
349
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.9999999999999999,0.016666666666666666
350
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
351
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
352
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
353
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,1,0.39999999999999997,0.48333333333333334
354
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,2,0.0,1.0
355
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
356
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,4,0.39999999999999997,0.48333333333333334
357
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
358
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,6,0.39999999999999997,0.48333333333333334
359
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,7,0.9999999999999999,0.016666666666666666
360
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,8,0.6,0.23333333333333334
361
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,9,0.6,0.23333333333333334
362
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,0,0.39999999999999997,0.48333333333333334
363
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,1,0.7999999999999999,0.08333333333333333
364
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
365
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,3,0.6,0.23333333333333334
366
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,4,0.6,0.23333333333333334
367
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,5,0.39999999999999997,0.48333333333333334
368
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,6,0.39999999999999997,0.48333333333333334
369
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,7,0.0,1.0
370
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,8,0.19999999999999998,0.8166666666666667
371
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,9,0.7999999999999999,0.08333333333333333
372
+ aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,0,0.19999999999999998,0.8166666666666667
373
+ aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,1,0.7999999999999999,0.08333333333333333
374
+ aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
375
+ aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,3,0.31622776601683794,0.44848886103153174
376
+ aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,4,0.9999999999999999,0.016666666666666666
377
+ aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,5,0.39999999999999997,0.48333333333333334
378
+ aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,6,0.39999999999999997,0.48333333333333334
379
+ aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,7,0.9999999999999999,0.016666666666666666
380
+ aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,8,0.39999999999999997,0.48333333333333334
381
+ aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,9,0.39999999999999997,0.48333333333333334
382
+ aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,0,0.9999999999999999,0.016666666666666666
383
+ aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,1,0.5270462766947298,0.206507295485425
384
+ aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,2,0.6,0.23333333333333334
385
+ aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,3,0.6,0.23333333333333334
386
+ aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
387
+ aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,5,0.5270462766947298,0.206507295485425
388
+ aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,6,0.7999999999999999,0.08333333333333333
389
+ aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,7,0.39999999999999997,0.48333333333333334
390
+ aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,8,0.39999999999999997,0.48333333333333334
391
+ aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,9,-0.10540925533894598,0.8005421074231263
392
+ aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,0,-0.10540925533894596,0.8005421074231263
393
+ aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,1,0.7999999999999999,0.08333333333333333
394
+ aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
395
+ aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,3,0.9999999999999999,0.016666666666666666
396
+ aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,4,0.19999999999999998,0.8166666666666667
397
+ aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,5,0.7999999999999999,0.08333333333333333
398
+ aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,6,0.39999999999999997,0.48333333333333334
399
+ aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,7,0.0,1.0
400
+ aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,8,0.0,1.0
401
+ aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,9,0.7999999999999999,0.08333333333333333
402
+ aggregate,holistic,grounding,biggen_240612,kendall,random,5,0,0.39999999999999997,0.48333333333333334
403
+ aggregate,holistic,grounding,biggen_240612,kendall,random,5,1,0.9999999999999999,0.016666666666666666
404
+ aggregate,holistic,grounding,biggen_240612,kendall,random,5,2,0.39999999999999997,0.48333333333333334
405
+ aggregate,holistic,grounding,biggen_240612,kendall,random,5,3,0.9999999999999999,0.016666666666666666
406
+ aggregate,holistic,grounding,biggen_240612,kendall,random,5,4,0.7999999999999999,0.08333333333333333
407
+ aggregate,holistic,grounding,biggen_240612,kendall,random,5,5,0.9486832980505137,0.02297740150320607
408
+ aggregate,holistic,grounding,biggen_240612,kendall,random,5,6,0.9999999999999999,0.016666666666666666
409
+ aggregate,holistic,grounding,biggen_240612,kendall,random,5,7,0.19999999999999998,0.8166666666666667
410
+ aggregate,holistic,grounding,biggen_240612,kendall,random,5,8,0.9999999999999999,0.016666666666666666
411
+ aggregate,holistic,grounding,biggen_240612,kendall,random,5,9,0.9999999999999999,0.016666666666666666
412
+ aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,0,0.19999999999999998,0.8166666666666667
413
+ aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,1,0.9999999999999999,0.016666666666666666
414
+ aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,2,0.0,1.0
415
+ aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,3,0.9999999999999999,0.016666666666666666
416
+ aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,4,0.7999999999999999,0.08333333333333333
417
+ aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,5,0.9486832980505137,0.02297740150320607
418
+ aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,6,0.9999999999999999,0.016666666666666666
419
+ aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,7,-0.10540925533894598,0.8005421074231263
420
+ aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,8,0.9486832980505137,0.02297740150320607
421
+ aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,9,0.9999999999999999,0.016666666666666666
422
+ aggregate,holistic,planning,biggen_240612,kendall,random,5,0,0.6,0.23333333333333334
423
+ aggregate,holistic,planning,biggen_240612,kendall,random,5,1,0.6,0.23333333333333334
424
+ aggregate,holistic,planning,biggen_240612,kendall,random,5,2,0.6,0.23333333333333334
425
+ aggregate,holistic,planning,biggen_240612,kendall,random,5,3,0.6,0.23333333333333334
426
+ aggregate,holistic,planning,biggen_240612,kendall,random,5,4,0.9999999999999999,0.016666666666666666
427
+ aggregate,holistic,planning,biggen_240612,kendall,random,5,5,0.39999999999999997,0.48333333333333334
428
+ aggregate,holistic,planning,biggen_240612,kendall,random,5,6,0.6,0.23333333333333334
429
+ aggregate,holistic,planning,biggen_240612,kendall,random,5,7,0.19999999999999998,0.8166666666666667
430
+ aggregate,holistic,planning,biggen_240612,kendall,random,5,8,0.9999999999999999,0.016666666666666666
431
+ aggregate,holistic,planning,biggen_240612,kendall,random,5,9,0.6,0.23333333333333334
432
+ aggregate,holistic,reasoning,biggen_240612,kendall,random,5,0,0.39999999999999997,0.48333333333333334
433
+ aggregate,holistic,reasoning,biggen_240612,kendall,random,5,1,0.7999999999999999,0.08333333333333333
434
+ aggregate,holistic,reasoning,biggen_240612,kendall,random,5,2,0.7378647873726218,0.07697417298126676
435
+ aggregate,holistic,reasoning,biggen_240612,kendall,random,5,3,0.9999999999999999,0.016666666666666666
436
+ aggregate,holistic,reasoning,biggen_240612,kendall,random,5,4,0.9999999999999999,0.016666666666666666
437
+ aggregate,holistic,reasoning,biggen_240612,kendall,random,5,5,0.9999999999999999,0.016666666666666666
438
+ aggregate,holistic,reasoning,biggen_240612,kendall,random,5,6,0.9999999999999999,0.016666666666666666
439
+ aggregate,holistic,reasoning,biggen_240612,kendall,random,5,7,0.39999999999999997,0.48333333333333334
440
+ aggregate,holistic,reasoning,biggen_240612,kendall,random,5,8,0.7999999999999999,0.08333333333333333
441
+ aggregate,holistic,reasoning,biggen_240612,kendall,random,5,9,0.9999999999999999,0.016666666666666666
442
+ aggregate,holistic,refinement,biggen_240612,kendall,random,5,0,0.6,0.23333333333333334
443
+ aggregate,holistic,refinement,biggen_240612,kendall,random,5,1,0.7999999999999999,0.08333333333333333
444
+ aggregate,holistic,refinement,biggen_240612,kendall,random,5,2,0.19999999999999998,0.8166666666666667
445
+ aggregate,holistic,refinement,biggen_240612,kendall,random,5,3,0.7999999999999999,0.08333333333333333
446
+ aggregate,holistic,refinement,biggen_240612,kendall,random,5,4,0.6,0.23333333333333334
447
+ aggregate,holistic,refinement,biggen_240612,kendall,random,5,5,0.39999999999999997,0.48333333333333334
448
+ aggregate,holistic,refinement,biggen_240612,kendall,random,5,6,0.9999999999999999,0.016666666666666666
449
+ aggregate,holistic,refinement,biggen_240612,kendall,random,5,7,-0.19999999999999998,0.8166666666666667
450
+ aggregate,holistic,refinement,biggen_240612,kendall,random,5,8,0.7999999999999999,0.08333333333333333
451
+ aggregate,holistic,refinement,biggen_240612,kendall,random,5,9,0.9999999999999999,0.016666666666666666
452
+ aggregate,holistic,safety,biggen_240612,kendall,random,5,0,0.39999999999999997,0.48333333333333334
453
+ aggregate,holistic,safety,biggen_240612,kendall,random,5,1,0.19999999999999998,0.8166666666666667
454
+ aggregate,holistic,safety,biggen_240612,kendall,random,5,2,-0.6,0.23333333333333334
455
+ aggregate,holistic,safety,biggen_240612,kendall,random,5,3,0.0,1.0
456
+ aggregate,holistic,safety,biggen_240612,kendall,random,5,4,0.39999999999999997,0.48333333333333334
457
+ aggregate,holistic,safety,biggen_240612,kendall,random,5,5,0.0,1.0
458
+ aggregate,holistic,safety,biggen_240612,kendall,random,5,6,0.39999999999999997,0.48333333333333334
459
+ aggregate,holistic,safety,biggen_240612,kendall,random,5,7,-0.6,0.23333333333333334
460
+ aggregate,holistic,safety,biggen_240612,kendall,random,5,8,-0.19999999999999998,0.8166666666666667
461
+ aggregate,holistic,safety,biggen_240612,kendall,random,5,9,0.39999999999999997,0.48333333333333334
462
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,0,0.6,0.23333333333333334
463
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,1,0.9999999999999999,0.016666666666666666
464
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,2,0.19999999999999998,0.8166666666666667
465
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,3,0.7999999999999999,0.08333333333333333
466
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,4,0.9999999999999999,0.016666666666666666
467
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,5,0.7999999999999999,0.08333333333333333
468
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,6,0.9999999999999999,0.016666666666666666
469
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,7,-0.19999999999999998,0.8166666666666667
470
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,8,0.9999999999999999,0.016666666666666666
471
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,9,0.9999999999999999,0.016666666666666666
472
+ aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,0,0.6,0.23333333333333334
473
+ aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,1,0.7999999999999999,0.08333333333333333
474
+ aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,2,0.9999999999999999,0.016666666666666666
475
+ aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,3,0.7999999999999999,0.08333333333333333
476
+ aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,4,0.6,0.23333333333333334
477
+ aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,5,0.6,0.23333333333333334
478
+ aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,6,0.7999999999999999,0.08333333333333333
479
+ aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,7,0.0,1.0
480
+ aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,8,0.7999999999999999,0.08333333333333333
481
+ aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,9,0.9999999999999999,0.016666666666666666
482
+ aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,0,0.39999999999999997,0.48333333333333334
483
+ aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,1,0.7999999999999999,0.08333333333333333
484
+ aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,2,0.6,0.23333333333333334
485
+ aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,3,0.39999999999999997,0.48333333333333334
486
+ aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,4,0.6,0.23333333333333334
487
+ aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,5,0.7999999999999999,0.08333333333333333
488
+ aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,6,0.6,0.23333333333333334
489
+ aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,7,0.7999999999999999,0.08333333333333333
490
+ aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,8,0.6,0.23333333333333334
491
+ aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,9,0.7999999999999999,0.08333333333333333
492
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,0,0.6,0.23333333333333334
493
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,1,0.7378647873726218,0.07697417298126676
494
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,2,0.6,0.23333333333333334
495
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,3,0.5270462766947298,0.206507295485425
496
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,4,0.7378647873726218,0.07697417298126676
497
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,5,0.7378647873726218,0.07697417298126676
498
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,6,0.7378647873726218,0.07697417298126676
499
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,7,0.7999999999999999,0.08333333333333333
500
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,8,0.7378647873726218,0.07697417298126676
501
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,9,0.7378647873726218,0.07697417298126676
502
+ aggregate,holistic,coding_average,livebench_240701,kendall,random,5,0,0.39999999999999997,0.48333333333333334
503
+ aggregate,holistic,coding_average,livebench_240701,kendall,random,5,1,0.7999999999999999,0.08333333333333333
504
+ aggregate,holistic,coding_average,livebench_240701,kendall,random,5,2,0.6,0.23333333333333334
505
+ aggregate,holistic,coding_average,livebench_240701,kendall,random,5,3,0.31622776601683794,0.44848886103153174
506
+ aggregate,holistic,coding_average,livebench_240701,kendall,random,5,4,0.5270462766947298,0.206507295485425
507
+ aggregate,holistic,coding_average,livebench_240701,kendall,random,5,5,0.7999999999999999,0.08333333333333333
508
+ aggregate,holistic,coding_average,livebench_240701,kendall,random,5,6,0.5270462766947298,0.206507295485425
509
+ aggregate,holistic,coding_average,livebench_240701,kendall,random,5,7,0.7378647873726218,0.07697417298126676
510
+ aggregate,holistic,coding_average,livebench_240701,kendall,random,5,8,0.5270462766947298,0.206507295485425
511
+ aggregate,holistic,coding_average,livebench_240701,kendall,random,5,9,0.7999999999999999,0.08333333333333333
512
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,0,0.39999999999999997,0.48333333333333334
513
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,1,0.9999999999999999,0.016666666666666666
514
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,2,0.6,0.23333333333333334
515
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,3,0.39999999999999997,0.48333333333333334
516
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,4,0.7999999999999999,0.08333333333333333
517
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,5,0.9999999999999999,0.016666666666666666
518
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,6,0.6,0.23333333333333334
519
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,7,0.7999999999999999,0.08333333333333333
520
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,8,0.7999999999999999,0.08333333333333333
521
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,9,0.9999999999999999,0.016666666666666666
522
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,0,0.6,0.23333333333333334
523
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,1,0.7999999999999999,0.08333333333333333
524
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,2,0.7999999999999999,0.08333333333333333
525
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,3,0.6,0.23333333333333334
526
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,4,0.7999999999999999,0.08333333333333333
527
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,5,0.7999999999999999,0.08333333333333333
528
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,6,0.9999999999999999,0.016666666666666666
529
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,7,0.9999999999999999,0.016666666666666666
530
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,8,0.7999999999999999,0.08333333333333333
531
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,9,0.7999999999999999,0.08333333333333333
532
+ aggregate,holistic,language_average,livebench_240701,kendall,random,5,0,0.6,0.23333333333333334
533
+ aggregate,holistic,language_average,livebench_240701,kendall,random,5,1,0.6,0.23333333333333334
534
+ aggregate,holistic,language_average,livebench_240701,kendall,random,5,2,0.6,0.23333333333333334
535
+ aggregate,holistic,language_average,livebench_240701,kendall,random,5,3,0.6,0.23333333333333334
536
+ aggregate,holistic,language_average,livebench_240701,kendall,random,5,4,0.7999999999999999,0.08333333333333333
537
+ aggregate,holistic,language_average,livebench_240701,kendall,random,5,5,0.6,0.23333333333333334
538
+ aggregate,holistic,language_average,livebench_240701,kendall,random,5,6,0.7999999999999999,0.08333333333333333
539
+ aggregate,holistic,language_average,livebench_240701,kendall,random,5,7,0.7999999999999999,0.08333333333333333
540
+ aggregate,holistic,language_average,livebench_240701,kendall,random,5,8,0.7999999999999999,0.08333333333333333
541
+ aggregate,holistic,language_average,livebench_240701,kendall,random,5,9,0.6,0.23333333333333334
542
+ aggregate,holistic,if_average,livebench_240701,kendall,random,5,0,0.39999999999999997,0.48333333333333334
543
+ aggregate,holistic,if_average,livebench_240701,kendall,random,5,1,0.6,0.23333333333333334
544
+ aggregate,holistic,if_average,livebench_240701,kendall,random,5,2,0.7999999999999999,0.08333333333333333
545
+ aggregate,holistic,if_average,livebench_240701,kendall,random,5,3,0.0,1.0
546
+ aggregate,holistic,if_average,livebench_240701,kendall,random,5,4,0.0,1.0
547
+ aggregate,holistic,if_average,livebench_240701,kendall,random,5,5,0.6,0.23333333333333334
548
+ aggregate,holistic,if_average,livebench_240701,kendall,random,5,6,0.19999999999999998,0.8166666666666667
549
+ aggregate,holistic,if_average,livebench_240701,kendall,random,5,7,0.39999999999999997,0.48333333333333334
550
+ aggregate,holistic,if_average,livebench_240701,kendall,random,5,8,0.0,1.0
551
+ aggregate,holistic,if_average,livebench_240701,kendall,random,5,9,0.6,0.23333333333333334
552
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,0,0.9999999999999999,0.016666666666666666
553
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,1,0.39999999999999997,0.48333333333333334
554
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,2,0.7999999999999999,0.08333333333333333
555
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,3,0.9999999999999999,0.016666666666666666
556
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,4,0.7999999999999999,0.08333333333333333
557
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,5,0.6,0.23333333333333334
558
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,6,0.9999999999999999,0.016666666666666666
559
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,7,0.7999999999999999,0.08333333333333333
560
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,8,0.9999999999999999,0.016666666666666666
561
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,9,0.9486832980505138,0.02297740150320607
562
+ aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,0,0.7999999999999999,0.08333333333333333
563
+ aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,1,0.9999999999999999,0.016666666666666666
564
+ aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,2,0.7999999999999999,0.08333333333333333
565
+ aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,3,0.9999999999999999,0.016666666666666666
566
+ aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,4,0.7999999999999999,0.08333333333333333
567
+ aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,5,0.9999999999999999,0.016666666666666666
568
+ aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,6,0.7999999999999999,0.08333333333333333
569
+ aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,7,0.9999999999999999,0.016666666666666666
570
+ aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,8,0.7999999999999999,0.08333333333333333
571
+ aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,9,0.9999999999999999,0.016666666666666666
572
+ aggregate,holistic,agieval,BLZ_240312,kendall,random,5,0,0.5270462766947299,0.206507295485425
573
+ aggregate,holistic,agieval,BLZ_240312,kendall,random,5,1,0.9999999999999999,0.016666666666666666
574
+ aggregate,holistic,agieval,BLZ_240312,kendall,random,5,2,0.6,0.23333333333333334
575
+ aggregate,holistic,agieval,BLZ_240312,kendall,random,5,3,0.7999999999999999,0.08333333333333333
576
+ aggregate,holistic,agieval,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
577
+ aggregate,holistic,agieval,BLZ_240312,kendall,random,5,5,0.7999999999999999,0.08333333333333333
578
+ aggregate,holistic,agieval,BLZ_240312,kendall,random,5,6,0.9999999999999999,0.016666666666666666
579
+ aggregate,holistic,agieval,BLZ_240312,kendall,random,5,7,0.6,0.23333333333333334
580
+ aggregate,holistic,agieval,BLZ_240312,kendall,random,5,8,0.19999999999999998,0.8166666666666667
581
+ aggregate,holistic,agieval,BLZ_240312,kendall,random,5,9,0.6,0.23333333333333334
582
+ aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,0,0.5270462766947298,0.206507295485425
583
+ aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,1,0.9999999999999999,0.016666666666666666
584
+ aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,2,0.9999999999999999,0.016666666666666666
585
+ aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,3,0.9999999999999999,0.016666666666666666
586
+ aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
587
+ aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,5,0.7999999999999999,0.08333333333333333
588
+ aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,6,0.7999999999999999,0.08333333333333333
589
+ aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,7,0.9999999999999999,0.016666666666666666
590
+ aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,8,0.19999999999999998,0.8166666666666667
591
+ aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,9,0.7999999999999999,0.08333333333333333
592
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,0,0.19999999999999998,0.8166666666666667
593
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,1,0.9999999999999999,0.016666666666666666
594
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,2,0.6,0.23333333333333334
595
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,3,0.9999999999999999,0.016666666666666666
596
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
597
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,5,0.39999999999999997,0.48333333333333334
598
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,6,0.19999999999999998,0.8166666666666667
599
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,7,0.39999999999999997,0.48333333333333334
600
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,8,0.9999999999999999,0.016666666666666666
601
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,9,0.7999999999999999,0.08333333333333333
602
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,0,0.6,0.23333333333333334
603
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,1,0.7999999999999999,0.08333333333333333
604
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
605
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,3,0.9999999999999999,0.016666666666666666
606
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
607
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,5,0.9999999999999999,0.016666666666666666
608
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,6,0.7999999999999999,0.08333333333333333
609
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,7,0.9999999999999999,0.016666666666666666
610
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,8,0.9999999999999999,0.016666666666666666
611
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,9,0.9999999999999999,0.016666666666666666
612
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,0,0.5270462766947299,0.206507295485425
613
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,1,0.39999999999999997,0.48333333333333334
614
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
615
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,3,0.6,0.23333333333333334
616
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,4,0.9999999999999999,0.016666666666666666
617
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,5,0.6,0.23333333333333334
618
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,6,0.7999999999999999,0.08333333333333333
619
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,7,0.6,0.23333333333333334
620
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,8,0.19999999999999998,0.8166666666666667
621
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,9,0.7999999999999999,0.08333333333333333
622
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,0,0.9999999999999999,0.016666666666666666
623
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,1,0.9999999999999999,0.016666666666666666
624
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,2,1.0,0.019176729141549043
625
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,3,0.9999999999999999,0.016666666666666666
626
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,4,0.9999999999999999,0.016666666666666666
627
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,5,0.9999999999999999,0.016666666666666666
628
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,6,0.9999999999999999,0.016666666666666666
629
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,7,0.9999999999999999,0.016666666666666666
630
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,8,0.9999999999999999,0.016666666666666666
631
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,9,0.9999999999999999,0.016666666666666666
632
+ aggregate,holistic,bbh,BLZ_240312,kendall,random,5,0,0.39999999999999997,0.48333333333333334
633
+ aggregate,holistic,bbh,BLZ_240312,kendall,random,5,1,0.9999999999999999,0.016666666666666666
634
+ aggregate,holistic,bbh,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
635
+ aggregate,holistic,bbh,BLZ_240312,kendall,random,5,3,0.7999999999999999,0.08333333333333333
636
+ aggregate,holistic,bbh,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
637
+ aggregate,holistic,bbh,BLZ_240312,kendall,random,5,5,0.7999999999999999,0.08333333333333333
638
+ aggregate,holistic,bbh,BLZ_240312,kendall,random,5,6,0.9486832980505137,0.02297740150320607
639
+ aggregate,holistic,bbh,BLZ_240312,kendall,random,5,7,0.19999999999999998,0.8166666666666667
640
+ aggregate,holistic,bbh,BLZ_240312,kendall,random,5,8,0.0,1.0
641
+ aggregate,holistic,bbh,BLZ_240312,kendall,random,5,9,0.9999999999999999,0.016666666666666666
642
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,0,0.7999999999999999,0.08333333333333333
643
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,1,0.6,0.23333333333333334
644
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,2,0.6,0.23333333333333334
645
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,3,0.9999999999999999,0.016666666666666666
646
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
647
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,5,0.6,0.23333333333333334
648
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,6,0.6,0.23333333333333334
649
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,7,0.5270462766947299,0.206507295485425
650
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,8,0.7999999999999999,0.08333333333333333
651
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,9,0.6,0.23333333333333334
652
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,0,0.6,0.23333333333333334
653
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,1,0.0,1.0
654
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
655
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,3,0.7999999999999999,0.08333333333333333
656
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,4,0.6,0.23333333333333334
657
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,5,0.7378647873726218,0.07697417298126676
658
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,6,0.6,0.23333333333333334
659
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,7,0.6,0.23333333333333334
660
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,8,0.9999999999999999,0.016666666666666666
661
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,9,0.39999999999999997,0.48333333333333334
662
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,0,0.6,0.23333333333333334
663
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,1,0.7999999999999999,0.08333333333333333
664
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,2,0.19999999999999998,0.8166666666666667
665
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,3,0.7999999999999999,0.08333333333333333
666
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,4,0.6,0.23333333333333334
667
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,5,0.39999999999999997,0.48333333333333334
668
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,6,0.39999999999999997,0.48333333333333334
669
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,7,0.6,0.23333333333333334
670
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,8,0.0,1.0
671
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,9,0.39999999999999997,0.48333333333333334
672
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,0,0.8366600265340756,0.05220363534131463
673
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,1,0.9999999999999999,0.016666666666666666
674
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
675
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,3,0.6,0.23333333333333334
676
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,4,0.6,0.23333333333333334
677
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,5,0.9999999999999999,0.016666666666666666
678
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,6,0.7378647873726218,0.07697417298126676
679
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,7,0.5270462766947298,0.206507295485425
680
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,8,0.3333333333333333,0.4349833603383296
681
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,9,0.39999999999999997,0.48333333333333334
682
+ aggregate,holistic,magi,BLZ_240312,kendall,random,5,0,0.7999999999999999,0.08333333333333333
683
+ aggregate,holistic,magi,BLZ_240312,kendall,random,5,1,0.5270462766947299,0.206507295485425
684
+ aggregate,holistic,magi,BLZ_240312,kendall,random,5,2,0.6,0.23333333333333334
685
+ aggregate,holistic,magi,BLZ_240312,kendall,random,5,3,0.7999999999999999,0.08333333333333333
686
+ aggregate,holistic,magi,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
687
+ aggregate,holistic,magi,BLZ_240312,kendall,random,5,5,0.9999999999999999,0.016666666666666666
688
+ aggregate,holistic,magi,BLZ_240312,kendall,random,5,6,0.10540925533894596,0.8005421074231263
689
+ aggregate,holistic,magi,BLZ_240312,kendall,random,5,7,0.39999999999999997,0.48333333333333334
690
+ aggregate,holistic,magi,BLZ_240312,kendall,random,5,8,0.31622776601683794,0.44848886103153174
691
+ aggregate,holistic,magi,BLZ_240312,kendall,random,5,9,0.9999999999999999,0.016666666666666666
692
+ aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,0,0.39999999999999997,0.48333333333333334
693
+ aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,1,0.7999999999999999,0.08333333333333333
694
+ aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,2,0.19999999999999998,0.8166666666666667
695
+ aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,3,0.9999999999999999,0.016666666666666666
696
+ aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,4,0.39999999999999997,0.48333333333333334
697
+ aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,5,0.6,0.23333333333333334
698
+ aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,6,0.6,0.23333333333333334
699
+ aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,7,0.9999999999999999,0.016666666666666666
700
+ aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,8,0.6,0.23333333333333334
701
+ aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,9,0.9999999999999999,0.016666666666666666
702
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,0,0.7999999999999999,0.08333333333333333
703
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,1,0.9999999999999999,0.016666666666666666
704
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
705
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,3,0.7999999999999999,0.08333333333333333
706
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,4,0.9999999999999999,0.016666666666666666
707
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,5,0.7999999999999999,0.08333333333333333
708
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,6,0.9999999999999999,0.016666666666666666
709
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,7,0.9999999999999999,0.016666666666666666
710
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,8,0.9999999999999999,0.016666666666666666
711
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,9,0.9999999999999999,0.016666666666666666
712
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,0,0.6,0.23333333333333334
713
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,1,0.39999999999999997,0.48333333333333334
714
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,2,0.0,1.0
715
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,3,0.6,0.23333333333333334
716
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,4,0.39999999999999997,0.48333333333333334
717
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,5,0.7999999999999999,0.08333333333333333
718
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,6,0.39999999999999997,0.48333333333333334
719
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,7,0.9999999999999999,0.016666666666666666
720
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,8,0.6,0.23333333333333334
721
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,9,0.6,0.23333333333333334
cache/agreements_cache_9aca1000dd25da3a044f5fd80fad0266.csv ADDED
@@ -0,0 +1,721 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value
2
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.9999999999999999,0.016666666666666666
3
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.39999999999999997,0.48333333333333334
4
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
5
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.39999999999999997,0.48333333333333334
6
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
7
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
8
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.39999999999999997,0.48333333333333334
9
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
10
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
11
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
12
+ humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.39999999999999997,0.48333333333333334
13
+ humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
14
+ humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
15
+ humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.39999999999999997,0.48333333333333334
16
+ humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
17
+ humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
18
+ humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
19
+ humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.6,0.23333333333333334
20
+ humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.6,0.23333333333333334
21
+ humaneval,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.6,0.23333333333333334
22
+ mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
23
+ mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.39999999999999997,0.48333333333333334
24
+ mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
25
+ mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.10540925533894598,0.8005421074231263
26
+ mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.19999999999999998,0.8166666666666667
27
+ mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.39999999999999997,0.48333333333333334
28
+ mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.10540925533894598,0.8005421074231263
29
+ mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.10540925533894598,0.8005421074231263
30
+ mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,8,-0.5270462766947298,0.206507295485425
31
+ mbpp,BLZ_240312,aggregate,holistic,kendall,random,5,9,-0.39999999999999997,0.48333333333333334
32
+ winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
33
+ winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
34
+ winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
35
+ winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
36
+ winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
37
+ winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
38
+ winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.39999999999999997,0.48333333333333334
39
+ winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.39999999999999997,0.48333333333333334
40
+ winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
41
+ winogrande,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
42
+ grounding,biggen_240612,aggregate,holistic,kendall,random,5,0,0.9486832980505137,0.02297740150320607
43
+ grounding,biggen_240612,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
44
+ grounding,biggen_240612,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
45
+ grounding,biggen_240612,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
46
+ grounding,biggen_240612,aggregate,holistic,kendall,random,5,4,0.6,0.23333333333333334
47
+ grounding,biggen_240612,aggregate,holistic,kendall,random,5,5,0.7378647873726218,0.07697417298126676
48
+ grounding,biggen_240612,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
49
+ grounding,biggen_240612,aggregate,holistic,kendall,random,5,7,0.9999999999999999,0.016666666666666666
50
+ grounding,biggen_240612,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
51
+ grounding,biggen_240612,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
52
+ instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,0,0.9486832980505137,0.02297740150320607
53
+ instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
54
+ instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,2,0.19999999999999998,0.8166666666666667
55
+ instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
56
+ instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,4,0.9486832980505137,0.02297740150320607
57
+ instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,5,0.7378647873726218,0.07697417298126676
58
+ instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
59
+ instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
60
+ instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
61
+ instruction_following,biggen_240612,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
62
+ planning,biggen_240612,aggregate,holistic,kendall,random,5,0,0.0,1.0
63
+ planning,biggen_240612,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
64
+ planning,biggen_240612,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
65
+ planning,biggen_240612,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
66
+ planning,biggen_240612,aggregate,holistic,kendall,random,5,4,0.6,0.23333333333333334
67
+ planning,biggen_240612,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
68
+ planning,biggen_240612,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
69
+ planning,biggen_240612,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
70
+ planning,biggen_240612,aggregate,holistic,kendall,random,5,8,0.39999999999999997,0.48333333333333334
71
+ planning,biggen_240612,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
72
+ reasoning,biggen_240612,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
73
+ reasoning,biggen_240612,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
74
+ reasoning,biggen_240612,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
75
+ reasoning,biggen_240612,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
76
+ reasoning,biggen_240612,aggregate,holistic,kendall,random,5,4,0.39999999999999997,0.48333333333333334
77
+ reasoning,biggen_240612,aggregate,holistic,kendall,random,5,5,0.9999999999999999,0.016666666666666666
78
+ reasoning,biggen_240612,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
79
+ reasoning,biggen_240612,aggregate,holistic,kendall,random,5,7,0.6,0.23333333333333334
80
+ reasoning,biggen_240612,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
81
+ reasoning,biggen_240612,aggregate,holistic,kendall,random,5,9,0.6,0.23333333333333334
82
+ refinement,biggen_240612,aggregate,holistic,kendall,random,5,0,0.39999999999999997,0.48333333333333334
83
+ refinement,biggen_240612,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
84
+ refinement,biggen_240612,aggregate,holistic,kendall,random,5,2,0.39999999999999997,0.48333333333333334
85
+ refinement,biggen_240612,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
86
+ refinement,biggen_240612,aggregate,holistic,kendall,random,5,4,0.6,0.23333333333333334
87
+ refinement,biggen_240612,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
88
+ refinement,biggen_240612,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
89
+ refinement,biggen_240612,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
90
+ refinement,biggen_240612,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
91
+ refinement,biggen_240612,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
92
+ safety,biggen_240612,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
93
+ safety,biggen_240612,aggregate,holistic,kendall,random,5,1,0.0,1.0
94
+ safety,biggen_240612,aggregate,holistic,kendall,random,5,2,-0.39999999999999997,0.48333333333333334
95
+ safety,biggen_240612,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
96
+ safety,biggen_240612,aggregate,holistic,kendall,random,5,4,-0.6,0.23333333333333334
97
+ safety,biggen_240612,aggregate,holistic,kendall,random,5,5,0.39999999999999997,0.48333333333333334
98
+ safety,biggen_240612,aggregate,holistic,kendall,random,5,6,-0.19999999999999998,0.8166666666666667
99
+ safety,biggen_240612,aggregate,holistic,kendall,random,5,7,-0.39999999999999997,0.48333333333333334
100
+ safety,biggen_240612,aggregate,holistic,kendall,random,5,8,-0.19999999999999998,0.8166666666666667
101
+ safety,biggen_240612,aggregate,holistic,kendall,random,5,9,0.0,1.0
102
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
103
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,1,0.6,0.23333333333333334
104
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,2,0.39999999999999997,0.48333333333333334
105
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
106
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,4,0.39999999999999997,0.48333333333333334
107
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
108
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
109
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
110
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
111
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
112
+ tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,0,0.19999999999999998,0.8166666666666667
113
+ tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
114
+ tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
115
+ tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
116
+ tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
117
+ tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
118
+ tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
119
+ tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,7,0.6,0.23333333333333334
120
+ tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
121
+ tool_usage,biggen_240612,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
122
+ livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
123
+ livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
124
+ livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
125
+ livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,3,0.39999999999999997,0.48333333333333334
126
+ livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,4,0.6,0.23333333333333334
127
+ livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
128
+ livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
129
+ livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
130
+ livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,8,0.6,0.23333333333333334
131
+ livebench_average,livebench_240701,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
132
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,0,0.7378647873726218,0.07697417298126676
133
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
134
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,2,0.7378647873726218,0.07697417298126676
135
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,3,0.5270462766947298,0.206507295485425
136
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
137
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
138
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
139
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
140
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
141
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
142
+ coding_average,livebench_240701,aggregate,holistic,kendall,random,5,0,0.5270462766947298,0.206507295485425
143
+ coding_average,livebench_240701,aggregate,holistic,kendall,random,5,1,0.7378647873726218,0.07697417298126676
144
+ coding_average,livebench_240701,aggregate,holistic,kendall,random,5,2,0.5270462766947298,0.206507295485425
145
+ coding_average,livebench_240701,aggregate,holistic,kendall,random,5,3,0.31622776601683794,0.44848886103153174
146
+ coding_average,livebench_240701,aggregate,holistic,kendall,random,5,4,0.6,0.23333333333333334
147
+ coding_average,livebench_240701,aggregate,holistic,kendall,random,5,5,0.7378647873726218,0.07697417298126676
148
+ coding_average,livebench_240701,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
149
+ coding_average,livebench_240701,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
150
+ coding_average,livebench_240701,aggregate,holistic,kendall,random,5,8,0.6,0.23333333333333334
151
+ coding_average,livebench_240701,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
152
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
153
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
154
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
155
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
156
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,4,0.6,0.23333333333333334
157
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,5,0.9999999999999999,0.016666666666666666
158
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
159
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
160
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,8,0.6,0.23333333333333334
161
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
162
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
163
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
164
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
165
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
166
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
167
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
168
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
169
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
170
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
171
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
172
+ language_average,livebench_240701,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
173
+ language_average,livebench_240701,aggregate,holistic,kendall,random,5,1,0.6,0.23333333333333334
174
+ language_average,livebench_240701,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
175
+ language_average,livebench_240701,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
176
+ language_average,livebench_240701,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
177
+ language_average,livebench_240701,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
178
+ language_average,livebench_240701,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
179
+ language_average,livebench_240701,aggregate,holistic,kendall,random,5,7,0.6,0.23333333333333334
180
+ language_average,livebench_240701,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
181
+ language_average,livebench_240701,aggregate,holistic,kendall,random,5,9,0.6,0.23333333333333334
182
+ if_average,livebench_240701,aggregate,holistic,kendall,random,5,0,-0.39999999999999997,0.48333333333333334
183
+ if_average,livebench_240701,aggregate,holistic,kendall,random,5,1,0.19999999999999998,0.8166666666666667
184
+ if_average,livebench_240701,aggregate,holistic,kendall,random,5,2,-0.19999999999999998,0.8166666666666667
185
+ if_average,livebench_240701,aggregate,holistic,kendall,random,5,3,-0.6,0.23333333333333334
186
+ if_average,livebench_240701,aggregate,holistic,kendall,random,5,4,0.19999999999999998,0.8166666666666667
187
+ if_average,livebench_240701,aggregate,holistic,kendall,random,5,5,0.19999999999999998,0.8166666666666667
188
+ if_average,livebench_240701,aggregate,holistic,kendall,random,5,6,0.0,1.0
189
+ if_average,livebench_240701,aggregate,holistic,kendall,random,5,7,0.39999999999999997,0.48333333333333334
190
+ if_average,livebench_240701,aggregate,holistic,kendall,random,5,8,0.19999999999999998,0.8166666666666667
191
+ if_average,livebench_240701,aggregate,holistic,kendall,random,5,9,0.6,0.23333333333333334
192
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,0,0.39999999999999997,0.48333333333333334
193
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
194
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,2,0.9999999999999999,0.016666666666666666
195
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
196
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
197
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,5,0.39999999999999997,0.48333333333333334
198
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
199
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,7,0.9999999999999999,0.016666666666666666
200
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
201
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,random,5,9,0.6,0.23333333333333334
202
+ mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
203
+ mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
204
+ mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
205
+ mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
206
+ mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
207
+ mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,5,0.9999999999999999,0.016666666666666666
208
+ mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
209
+ mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
210
+ mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
211
+ mixeval,mixeval_240601,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
212
+ agieval,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.9999999999999999,0.016666666666666666
213
+ agieval,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
214
+ agieval,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.9999999999999999,0.016666666666666666
215
+ agieval,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
216
+ agieval,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
217
+ agieval,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
218
+ agieval,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
219
+ agieval,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
220
+ agieval,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
221
+ agieval,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
222
+ arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.9486832980505137,0.02297740150320607
223
+ arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
224
+ arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.9999999999999999,0.016666666666666666
225
+ arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
226
+ arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
227
+ arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
228
+ arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
229
+ arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.9999999999999999,0.016666666666666666
230
+ arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
231
+ arc_c,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
232
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
233
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
234
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.39999999999999997,0.48333333333333334
235
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9486832980505137,0.02297740150320607
236
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
237
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
238
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
239
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.39999999999999997,0.48333333333333334
240
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
241
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
242
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.9999999999999999,0.016666666666666666
243
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
244
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
245
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
246
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.6,0.23333333333333334
247
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
248
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.19999999999999998,0.8166666666666667
249
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
250
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
251
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
252
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.39999999999999997,0.48333333333333334
253
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.6,0.23333333333333334
254
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.9999999999999999,0.016666666666666666
255
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
256
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
257
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
258
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
259
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
260
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
261
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
262
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.9486832980505137,0.02297740150320607
263
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.6,0.23333333333333334
264
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
265
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
266
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
267
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
268
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
269
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.6,0.23333333333333334
270
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
271
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
272
+ bbh,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
273
+ bbh,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
274
+ bbh,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
275
+ bbh,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
276
+ bbh,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
277
+ bbh,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
278
+ bbh,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
279
+ bbh,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.9999999999999999,0.016666666666666666
280
+ bbh,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
281
+ bbh,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
282
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
283
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
284
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
285
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
286
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
287
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
288
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
289
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
290
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
291
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
292
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
293
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.6,0.23333333333333334
294
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.39999999999999997,0.48333333333333334
295
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
296
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
297
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.9999999999999999,0.016666666666666666
298
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
299
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.6,0.23333333333333334
300
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.39999999999999997,0.48333333333333334
301
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
302
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
303
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
304
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
305
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
306
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
307
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
308
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
309
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.19999999999999998,0.8166666666666667
310
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.6,0.23333333333333334
311
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.6,0.23333333333333334
312
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.9999999999999999,0.016666666666666666
313
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.9486832980505137,0.02297740150320607
314
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.9999999999999999,0.016666666666666666
315
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
316
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.9486832980505137,0.02297740150320607
317
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.7999999999999999,0.08333333333333333
318
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.6,0.23333333333333334
319
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
320
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.7378647873726218,0.07697417298126676
321
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
322
+ magi,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.9999999999999999,0.016666666666666666
323
+ magi,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.6,0.23333333333333334
324
+ magi,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.6,0.23333333333333334
325
+ magi,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
326
+ magi,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
327
+ magi,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.9999999999999999,0.016666666666666666
328
+ magi,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
329
+ magi,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
330
+ magi,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.39999999999999997,0.48333333333333334
331
+ magi,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.6,0.23333333333333334
332
+ mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
333
+ mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
334
+ mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.9999999999999999,0.016666666666666666
335
+ mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.7999999999999999,0.08333333333333333
336
+ mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.7999999999999999,0.08333333333333333
337
+ mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
338
+ mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
339
+ mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.6,0.23333333333333334
340
+ mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.9999999999999999,0.016666666666666666
341
+ mmlu,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
342
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,0,0.7999999999999999,0.08333333333333333
343
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,1,0.7999999999999999,0.08333333333333333
344
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
345
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,3,0.9999999999999999,0.016666666666666666
346
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
347
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,5,0.39999999999999997,0.48333333333333334
348
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,6,0.9999999999999999,0.016666666666666666
349
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,7,0.7999999999999999,0.08333333333333333
350
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
351
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,random,5,9,0.9999999999999999,0.016666666666666666
352
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,0,0.6,0.23333333333333334
353
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,1,0.9999999999999999,0.016666666666666666
354
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,2,0.7999999999999999,0.08333333333333333
355
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,3,0.6,0.23333333333333334
356
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,4,0.9999999999999999,0.016666666666666666
357
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,5,0.6,0.23333333333333334
358
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,6,0.7999999999999999,0.08333333333333333
359
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,7,0.9999999999999999,0.016666666666666666
360
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,8,0.7999999999999999,0.08333333333333333
361
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,random,5,9,0.7999999999999999,0.08333333333333333
362
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,0,0.9999999999999999,0.016666666666666666
363
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,1,0.39999999999999997,0.48333333333333334
364
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
365
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,3,0.39999999999999997,0.48333333333333334
366
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
367
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,5,0.6,0.23333333333333334
368
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,6,0.39999999999999997,0.48333333333333334
369
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,7,0.7999999999999999,0.08333333333333333
370
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,8,0.9999999999999999,0.016666666666666666
371
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,random,5,9,0.7999999999999999,0.08333333333333333
372
+ aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,0,0.39999999999999997,0.48333333333333334
373
+ aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,1,0.7999999999999999,0.08333333333333333
374
+ aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
375
+ aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,3,0.39999999999999997,0.48333333333333334
376
+ aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,4,0.9999999999999999,0.016666666666666666
377
+ aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,5,0.6,0.23333333333333334
378
+ aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,6,0.6,0.23333333333333334
379
+ aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,7,0.6,0.23333333333333334
380
+ aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,8,0.6,0.23333333333333334
381
+ aggregate,holistic,humaneval,BLZ_240312,kendall,random,5,9,0.6,0.23333333333333334
382
+ aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,0,0.6,0.23333333333333334
383
+ aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,1,0.39999999999999997,0.48333333333333334
384
+ aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,2,0.6,0.23333333333333334
385
+ aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,3,0.10540925533894598,0.8005421074231263
386
+ aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,4,0.19999999999999998,0.8166666666666667
387
+ aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,5,0.39999999999999997,0.48333333333333334
388
+ aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,6,0.10540925533894598,0.8005421074231263
389
+ aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,7,0.10540925533894598,0.8005421074231263
390
+ aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,8,-0.5270462766947298,0.206507295485425
391
+ aggregate,holistic,mbpp,BLZ_240312,kendall,random,5,9,-0.39999999999999997,0.48333333333333334
392
+ aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,0,0.6,0.23333333333333334
393
+ aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,1,0.7999999999999999,0.08333333333333333
394
+ aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
395
+ aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,3,0.9999999999999999,0.016666666666666666
396
+ aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
397
+ aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,5,0.7999999999999999,0.08333333333333333
398
+ aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,6,0.39999999999999997,0.48333333333333334
399
+ aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,7,0.39999999999999997,0.48333333333333334
400
+ aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,8,0.7999999999999999,0.08333333333333333
401
+ aggregate,holistic,winogrande,BLZ_240312,kendall,random,5,9,0.9999999999999999,0.016666666666666666
402
+ aggregate,holistic,grounding,biggen_240612,kendall,random,5,0,0.9486832980505137,0.02297740150320607
403
+ aggregate,holistic,grounding,biggen_240612,kendall,random,5,1,0.9999999999999999,0.016666666666666666
404
+ aggregate,holistic,grounding,biggen_240612,kendall,random,5,2,0.6,0.23333333333333334
405
+ aggregate,holistic,grounding,biggen_240612,kendall,random,5,3,0.7999999999999999,0.08333333333333333
406
+ aggregate,holistic,grounding,biggen_240612,kendall,random,5,4,0.6,0.23333333333333334
407
+ aggregate,holistic,grounding,biggen_240612,kendall,random,5,5,0.7378647873726218,0.07697417298126676
408
+ aggregate,holistic,grounding,biggen_240612,kendall,random,5,6,0.6,0.23333333333333334
409
+ aggregate,holistic,grounding,biggen_240612,kendall,random,5,7,0.9999999999999999,0.016666666666666666
410
+ aggregate,holistic,grounding,biggen_240612,kendall,random,5,8,0.9999999999999999,0.016666666666666666
411
+ aggregate,holistic,grounding,biggen_240612,kendall,random,5,9,0.7999999999999999,0.08333333333333333
412
+ aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,0,0.9486832980505137,0.02297740150320607
413
+ aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,1,0.7999999999999999,0.08333333333333333
414
+ aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,2,0.19999999999999998,0.8166666666666667
415
+ aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,3,0.6,0.23333333333333334
416
+ aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,4,0.9486832980505137,0.02297740150320607
417
+ aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,5,0.7378647873726218,0.07697417298126676
418
+ aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,6,0.9999999999999999,0.016666666666666666
419
+ aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,7,0.7999999999999999,0.08333333333333333
420
+ aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,8,0.7999999999999999,0.08333333333333333
421
+ aggregate,holistic,instruction_following,biggen_240612,kendall,random,5,9,0.9999999999999999,0.016666666666666666
422
+ aggregate,holistic,planning,biggen_240612,kendall,random,5,0,0.0,1.0
423
+ aggregate,holistic,planning,biggen_240612,kendall,random,5,1,0.7999999999999999,0.08333333333333333
424
+ aggregate,holistic,planning,biggen_240612,kendall,random,5,2,0.7999999999999999,0.08333333333333333
425
+ aggregate,holistic,planning,biggen_240612,kendall,random,5,3,0.9999999999999999,0.016666666666666666
426
+ aggregate,holistic,planning,biggen_240612,kendall,random,5,4,0.6,0.23333333333333334
427
+ aggregate,holistic,planning,biggen_240612,kendall,random,5,5,0.7999999999999999,0.08333333333333333
428
+ aggregate,holistic,planning,biggen_240612,kendall,random,5,6,0.6,0.23333333333333334
429
+ aggregate,holistic,planning,biggen_240612,kendall,random,5,7,0.7999999999999999,0.08333333333333333
430
+ aggregate,holistic,planning,biggen_240612,kendall,random,5,8,0.39999999999999997,0.48333333333333334
431
+ aggregate,holistic,planning,biggen_240612,kendall,random,5,9,0.9999999999999999,0.016666666666666666
432
+ aggregate,holistic,reasoning,biggen_240612,kendall,random,5,0,0.7999999999999999,0.08333333333333333
433
+ aggregate,holistic,reasoning,biggen_240612,kendall,random,5,1,0.7999999999999999,0.08333333333333333
434
+ aggregate,holistic,reasoning,biggen_240612,kendall,random,5,2,0.6,0.23333333333333334
435
+ aggregate,holistic,reasoning,biggen_240612,kendall,random,5,3,0.7999999999999999,0.08333333333333333
436
+ aggregate,holistic,reasoning,biggen_240612,kendall,random,5,4,0.39999999999999997,0.48333333333333334
437
+ aggregate,holistic,reasoning,biggen_240612,kendall,random,5,5,0.9999999999999999,0.016666666666666666
438
+ aggregate,holistic,reasoning,biggen_240612,kendall,random,5,6,0.7999999999999999,0.08333333333333333
439
+ aggregate,holistic,reasoning,biggen_240612,kendall,random,5,7,0.6,0.23333333333333334
440
+ aggregate,holistic,reasoning,biggen_240612,kendall,random,5,8,0.7999999999999999,0.08333333333333333
441
+ aggregate,holistic,reasoning,biggen_240612,kendall,random,5,9,0.6,0.23333333333333334
442
+ aggregate,holistic,refinement,biggen_240612,kendall,random,5,0,0.39999999999999997,0.48333333333333334
443
+ aggregate,holistic,refinement,biggen_240612,kendall,random,5,1,0.7999999999999999,0.08333333333333333
444
+ aggregate,holistic,refinement,biggen_240612,kendall,random,5,2,0.39999999999999997,0.48333333333333334
445
+ aggregate,holistic,refinement,biggen_240612,kendall,random,5,3,0.9999999999999999,0.016666666666666666
446
+ aggregate,holistic,refinement,biggen_240612,kendall,random,5,4,0.6,0.23333333333333334
447
+ aggregate,holistic,refinement,biggen_240612,kendall,random,5,5,0.7999999999999999,0.08333333333333333
448
+ aggregate,holistic,refinement,biggen_240612,kendall,random,5,6,0.7999999999999999,0.08333333333333333
449
+ aggregate,holistic,refinement,biggen_240612,kendall,random,5,7,0.7999999999999999,0.08333333333333333
450
+ aggregate,holistic,refinement,biggen_240612,kendall,random,5,8,0.7999999999999999,0.08333333333333333
451
+ aggregate,holistic,refinement,biggen_240612,kendall,random,5,9,0.9999999999999999,0.016666666666666666
452
+ aggregate,holistic,safety,biggen_240612,kendall,random,5,0,0.6,0.23333333333333334
453
+ aggregate,holistic,safety,biggen_240612,kendall,random,5,1,0.0,1.0
454
+ aggregate,holistic,safety,biggen_240612,kendall,random,5,2,-0.39999999999999997,0.48333333333333334
455
+ aggregate,holistic,safety,biggen_240612,kendall,random,5,3,0.6,0.23333333333333334
456
+ aggregate,holistic,safety,biggen_240612,kendall,random,5,4,-0.6,0.23333333333333334
457
+ aggregate,holistic,safety,biggen_240612,kendall,random,5,5,0.39999999999999997,0.48333333333333334
458
+ aggregate,holistic,safety,biggen_240612,kendall,random,5,6,-0.19999999999999998,0.8166666666666667
459
+ aggregate,holistic,safety,biggen_240612,kendall,random,5,7,-0.39999999999999997,0.48333333333333334
460
+ aggregate,holistic,safety,biggen_240612,kendall,random,5,8,-0.19999999999999998,0.8166666666666667
461
+ aggregate,holistic,safety,biggen_240612,kendall,random,5,9,0.0,1.0
462
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,0,0.7999999999999999,0.08333333333333333
463
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,1,0.6,0.23333333333333334
464
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,2,0.39999999999999997,0.48333333333333334
465
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,3,0.7999999999999999,0.08333333333333333
466
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,4,0.39999999999999997,0.48333333333333334
467
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,5,0.7999999999999999,0.08333333333333333
468
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,6,0.9999999999999999,0.016666666666666666
469
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,7,0.7999999999999999,0.08333333333333333
470
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,8,0.7999999999999999,0.08333333333333333
471
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,random,5,9,0.9999999999999999,0.016666666666666666
472
+ aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,0,0.19999999999999998,0.8166666666666667
473
+ aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,1,0.9999999999999999,0.016666666666666666
474
+ aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,2,0.7999999999999999,0.08333333333333333
475
+ aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,3,0.6,0.23333333333333334
476
+ aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,4,0.9999999999999999,0.016666666666666666
477
+ aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,5,0.6,0.23333333333333334
478
+ aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,6,0.7999999999999999,0.08333333333333333
479
+ aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,7,0.6,0.23333333333333334
480
+ aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,8,0.7999999999999999,0.08333333333333333
481
+ aggregate,holistic,tool_usage,biggen_240612,kendall,random,5,9,0.7999999999999999,0.08333333333333333
482
+ aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,0,0.6,0.23333333333333334
483
+ aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,1,0.7999999999999999,0.08333333333333333
484
+ aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,2,0.6,0.23333333333333334
485
+ aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,3,0.39999999999999997,0.48333333333333334
486
+ aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,4,0.6,0.23333333333333334
487
+ aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,5,0.7999999999999999,0.08333333333333333
488
+ aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,6,0.6,0.23333333333333334
489
+ aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,7,0.7999999999999999,0.08333333333333333
490
+ aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,8,0.6,0.23333333333333334
491
+ aggregate,holistic,livebench_average,livebench_240701,kendall,random,5,9,0.7999999999999999,0.08333333333333333
492
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,0,0.7378647873726218,0.07697417298126676
493
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,1,0.7999999999999999,0.08333333333333333
494
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,2,0.7378647873726218,0.07697417298126676
495
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,3,0.5270462766947298,0.206507295485425
496
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,4,0.7999999999999999,0.08333333333333333
497
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,5,0.7999999999999999,0.08333333333333333
498
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,6,0.7999999999999999,0.08333333333333333
499
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,7,0.7999999999999999,0.08333333333333333
500
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,8,0.7999999999999999,0.08333333333333333
501
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,random,5,9,0.7999999999999999,0.08333333333333333
502
+ aggregate,holistic,coding_average,livebench_240701,kendall,random,5,0,0.5270462766947298,0.206507295485425
503
+ aggregate,holistic,coding_average,livebench_240701,kendall,random,5,1,0.7378647873726218,0.07697417298126676
504
+ aggregate,holistic,coding_average,livebench_240701,kendall,random,5,2,0.5270462766947298,0.206507295485425
505
+ aggregate,holistic,coding_average,livebench_240701,kendall,random,5,3,0.31622776601683794,0.44848886103153174
506
+ aggregate,holistic,coding_average,livebench_240701,kendall,random,5,4,0.6,0.23333333333333334
507
+ aggregate,holistic,coding_average,livebench_240701,kendall,random,5,5,0.7378647873726218,0.07697417298126676
508
+ aggregate,holistic,coding_average,livebench_240701,kendall,random,5,6,0.6,0.23333333333333334
509
+ aggregate,holistic,coding_average,livebench_240701,kendall,random,5,7,0.7999999999999999,0.08333333333333333
510
+ aggregate,holistic,coding_average,livebench_240701,kendall,random,5,8,0.6,0.23333333333333334
511
+ aggregate,holistic,coding_average,livebench_240701,kendall,random,5,9,0.7999999999999999,0.08333333333333333
512
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,0,0.6,0.23333333333333334
513
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,1,0.9999999999999999,0.016666666666666666
514
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,2,0.7999999999999999,0.08333333333333333
515
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,3,0.6,0.23333333333333334
516
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,4,0.6,0.23333333333333334
517
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,5,0.9999999999999999,0.016666666666666666
518
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,6,0.6,0.23333333333333334
519
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,7,0.7999999999999999,0.08333333333333333
520
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,8,0.6,0.23333333333333334
521
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,random,5,9,0.9999999999999999,0.016666666666666666
522
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,0,0.7999999999999999,0.08333333333333333
523
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,1,0.7999999999999999,0.08333333333333333
524
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,2,0.7999999999999999,0.08333333333333333
525
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,3,0.6,0.23333333333333334
526
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,4,0.7999999999999999,0.08333333333333333
527
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,5,0.7999999999999999,0.08333333333333333
528
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,6,0.7999999999999999,0.08333333333333333
529
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,7,0.7999999999999999,0.08333333333333333
530
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,8,0.7999999999999999,0.08333333333333333
531
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,random,5,9,0.7999999999999999,0.08333333333333333
532
+ aggregate,holistic,language_average,livebench_240701,kendall,random,5,0,0.7999999999999999,0.08333333333333333
533
+ aggregate,holistic,language_average,livebench_240701,kendall,random,5,1,0.6,0.23333333333333334
534
+ aggregate,holistic,language_average,livebench_240701,kendall,random,5,2,0.6,0.23333333333333334
535
+ aggregate,holistic,language_average,livebench_240701,kendall,random,5,3,0.6,0.23333333333333334
536
+ aggregate,holistic,language_average,livebench_240701,kendall,random,5,4,0.7999999999999999,0.08333333333333333
537
+ aggregate,holistic,language_average,livebench_240701,kendall,random,5,5,0.6,0.23333333333333334
538
+ aggregate,holistic,language_average,livebench_240701,kendall,random,5,6,0.6,0.23333333333333334
539
+ aggregate,holistic,language_average,livebench_240701,kendall,random,5,7,0.6,0.23333333333333334
540
+ aggregate,holistic,language_average,livebench_240701,kendall,random,5,8,0.7999999999999999,0.08333333333333333
541
+ aggregate,holistic,language_average,livebench_240701,kendall,random,5,9,0.6,0.23333333333333334
542
+ aggregate,holistic,if_average,livebench_240701,kendall,random,5,0,-0.39999999999999997,0.48333333333333334
543
+ aggregate,holistic,if_average,livebench_240701,kendall,random,5,1,0.19999999999999998,0.8166666666666667
544
+ aggregate,holistic,if_average,livebench_240701,kendall,random,5,2,-0.19999999999999998,0.8166666666666667
545
+ aggregate,holistic,if_average,livebench_240701,kendall,random,5,3,-0.6,0.23333333333333334
546
+ aggregate,holistic,if_average,livebench_240701,kendall,random,5,4,0.19999999999999998,0.8166666666666667
547
+ aggregate,holistic,if_average,livebench_240701,kendall,random,5,5,0.19999999999999998,0.8166666666666667
548
+ aggregate,holistic,if_average,livebench_240701,kendall,random,5,6,0.0,1.0
549
+ aggregate,holistic,if_average,livebench_240701,kendall,random,5,7,0.39999999999999997,0.48333333333333334
550
+ aggregate,holistic,if_average,livebench_240701,kendall,random,5,8,0.19999999999999998,0.8166666666666667
551
+ aggregate,holistic,if_average,livebench_240701,kendall,random,5,9,0.6,0.23333333333333334
552
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,0,0.39999999999999997,0.48333333333333334
553
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,1,0.7999999999999999,0.08333333333333333
554
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,2,0.9999999999999999,0.016666666666666666
555
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,3,0.7999999999999999,0.08333333333333333
556
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,4,0.7999999999999999,0.08333333333333333
557
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,5,0.39999999999999997,0.48333333333333334
558
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,6,0.7999999999999999,0.08333333333333333
559
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,7,0.9999999999999999,0.016666666666666666
560
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,8,0.9999999999999999,0.016666666666666666
561
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,random,5,9,0.6,0.23333333333333334
562
+ aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,0,0.7999999999999999,0.08333333333333333
563
+ aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,1,0.9999999999999999,0.016666666666666666
564
+ aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,2,0.7999999999999999,0.08333333333333333
565
+ aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,3,0.7999999999999999,0.08333333333333333
566
+ aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,4,0.7999999999999999,0.08333333333333333
567
+ aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,5,0.9999999999999999,0.016666666666666666
568
+ aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,6,0.9999999999999999,0.016666666666666666
569
+ aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,7,0.7999999999999999,0.08333333333333333
570
+ aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,8,0.7999999999999999,0.08333333333333333
571
+ aggregate,holistic,mixeval,mixeval_240601,kendall,random,5,9,0.7999999999999999,0.08333333333333333
572
+ aggregate,holistic,agieval,BLZ_240312,kendall,random,5,0,0.9999999999999999,0.016666666666666666
573
+ aggregate,holistic,agieval,BLZ_240312,kendall,random,5,1,0.9999999999999999,0.016666666666666666
574
+ aggregate,holistic,agieval,BLZ_240312,kendall,random,5,2,0.9999999999999999,0.016666666666666666
575
+ aggregate,holistic,agieval,BLZ_240312,kendall,random,5,3,0.9999999999999999,0.016666666666666666
576
+ aggregate,holistic,agieval,BLZ_240312,kendall,random,5,4,0.9999999999999999,0.016666666666666666
577
+ aggregate,holistic,agieval,BLZ_240312,kendall,random,5,5,0.7999999999999999,0.08333333333333333
578
+ aggregate,holistic,agieval,BLZ_240312,kendall,random,5,6,0.9999999999999999,0.016666666666666666
579
+ aggregate,holistic,agieval,BLZ_240312,kendall,random,5,7,0.7999999999999999,0.08333333333333333
580
+ aggregate,holistic,agieval,BLZ_240312,kendall,random,5,8,0.9999999999999999,0.016666666666666666
581
+ aggregate,holistic,agieval,BLZ_240312,kendall,random,5,9,0.9999999999999999,0.016666666666666666
582
+ aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,0,0.9486832980505137,0.02297740150320607
583
+ aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,1,0.9999999999999999,0.016666666666666666
584
+ aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,2,0.9999999999999999,0.016666666666666666
585
+ aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,3,0.7999999999999999,0.08333333333333333
586
+ aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,4,0.9999999999999999,0.016666666666666666
587
+ aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,5,0.6,0.23333333333333334
588
+ aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,6,0.7999999999999999,0.08333333333333333
589
+ aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,7,0.9999999999999999,0.016666666666666666
590
+ aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,8,0.9999999999999999,0.016666666666666666
591
+ aggregate,holistic,arc_c,BLZ_240312,kendall,random,5,9,0.9999999999999999,0.016666666666666666
592
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,0,0.7999999999999999,0.08333333333333333
593
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,1,0.9999999999999999,0.016666666666666666
594
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,2,0.39999999999999997,0.48333333333333334
595
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,3,0.9486832980505137,0.02297740150320607
596
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
597
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,5,0.6,0.23333333333333334
598
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,6,0.7999999999999999,0.08333333333333333
599
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,7,0.39999999999999997,0.48333333333333334
600
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,8,0.7999999999999999,0.08333333333333333
601
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,random,5,9,0.7999999999999999,0.08333333333333333
602
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,0,0.9999999999999999,0.016666666666666666
603
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,1,0.7999999999999999,0.08333333333333333
604
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
605
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,3,0.7999999999999999,0.08333333333333333
606
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,4,0.6,0.23333333333333334
607
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,5,0.7999999999999999,0.08333333333333333
608
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,6,0.19999999999999998,0.8166666666666667
609
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,7,0.7999999999999999,0.08333333333333333
610
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,8,0.9999999999999999,0.016666666666666666
611
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,random,5,9,0.7999999999999999,0.08333333333333333
612
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,0,0.39999999999999997,0.48333333333333334
613
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,1,0.6,0.23333333333333334
614
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,2,0.9999999999999999,0.016666666666666666
615
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,3,0.6,0.23333333333333334
616
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,4,0.9999999999999999,0.016666666666666666
617
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,5,0.7999999999999999,0.08333333333333333
618
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,6,0.6,0.23333333333333334
619
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,7,0.7999999999999999,0.08333333333333333
620
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,8,0.9999999999999999,0.016666666666666666
621
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,random,5,9,0.7999999999999999,0.08333333333333333
622
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,0,0.9486832980505137,0.02297740150320607
623
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,1,0.6,0.23333333333333334
624
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
625
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,3,0.9999999999999999,0.016666666666666666
626
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
627
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,5,0.6,0.23333333333333334
628
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,6,0.7999999999999999,0.08333333333333333
629
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,7,0.6,0.23333333333333334
630
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,8,0.7999999999999999,0.08333333333333333
631
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,random,5,9,0.9999999999999999,0.016666666666666666
632
+ aggregate,holistic,bbh,BLZ_240312,kendall,random,5,0,0.6,0.23333333333333334
633
+ aggregate,holistic,bbh,BLZ_240312,kendall,random,5,1,0.9999999999999999,0.016666666666666666
634
+ aggregate,holistic,bbh,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
635
+ aggregate,holistic,bbh,BLZ_240312,kendall,random,5,3,0.9999999999999999,0.016666666666666666
636
+ aggregate,holistic,bbh,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
637
+ aggregate,holistic,bbh,BLZ_240312,kendall,random,5,5,0.7999999999999999,0.08333333333333333
638
+ aggregate,holistic,bbh,BLZ_240312,kendall,random,5,6,0.7999999999999999,0.08333333333333333
639
+ aggregate,holistic,bbh,BLZ_240312,kendall,random,5,7,0.9999999999999999,0.016666666666666666
640
+ aggregate,holistic,bbh,BLZ_240312,kendall,random,5,8,0.7999999999999999,0.08333333333333333
641
+ aggregate,holistic,bbh,BLZ_240312,kendall,random,5,9,0.7999999999999999,0.08333333333333333
642
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,0,0.7999999999999999,0.08333333333333333
643
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,1,0.9999999999999999,0.016666666666666666
644
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
645
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,3,0.9999999999999999,0.016666666666666666
646
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,4,0.9999999999999999,0.016666666666666666
647
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,5,0.6,0.23333333333333334
648
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,6,0.7999999999999999,0.08333333333333333
649
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,7,0.7999999999999999,0.08333333333333333
650
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,8,0.7999999999999999,0.08333333333333333
651
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,random,5,9,0.9999999999999999,0.016666666666666666
652
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,0,0.7999999999999999,0.08333333333333333
653
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,1,0.6,0.23333333333333334
654
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,2,0.39999999999999997,0.48333333333333334
655
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,3,0.7999999999999999,0.08333333333333333
656
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,4,0.9999999999999999,0.016666666666666666
657
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,5,0.9999999999999999,0.016666666666666666
658
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,6,0.7999999999999999,0.08333333333333333
659
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,7,0.6,0.23333333333333334
660
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,8,0.39999999999999997,0.48333333333333334
661
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,random,5,9,0.7999999999999999,0.08333333333333333
662
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,0,0.7999999999999999,0.08333333333333333
663
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,1,0.7999999999999999,0.08333333333333333
664
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
665
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,3,0.9999999999999999,0.016666666666666666
666
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
667
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,5,0.7999999999999999,0.08333333333333333
668
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,6,0.6,0.23333333333333334
669
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,7,0.19999999999999998,0.8166666666666667
670
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,8,0.6,0.23333333333333334
671
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,random,5,9,0.6,0.23333333333333334
672
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,0,0.9999999999999999,0.016666666666666666
673
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,1,0.9486832980505137,0.02297740150320607
674
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,2,0.9999999999999999,0.016666666666666666
675
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,3,0.7999999999999999,0.08333333333333333
676
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,4,0.9486832980505137,0.02297740150320607
677
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,5,0.7999999999999999,0.08333333333333333
678
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,6,0.6,0.23333333333333334
679
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,7,0.7999999999999999,0.08333333333333333
680
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,8,0.7378647873726218,0.07697417298126676
681
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,random,5,9,0.7999999999999999,0.08333333333333333
682
+ aggregate,holistic,magi,BLZ_240312,kendall,random,5,0,0.9999999999999999,0.016666666666666666
683
+ aggregate,holistic,magi,BLZ_240312,kendall,random,5,1,0.6,0.23333333333333334
684
+ aggregate,holistic,magi,BLZ_240312,kendall,random,5,2,0.6,0.23333333333333334
685
+ aggregate,holistic,magi,BLZ_240312,kendall,random,5,3,0.6,0.23333333333333334
686
+ aggregate,holistic,magi,BLZ_240312,kendall,random,5,4,0.9999999999999999,0.016666666666666666
687
+ aggregate,holistic,magi,BLZ_240312,kendall,random,5,5,0.9999999999999999,0.016666666666666666
688
+ aggregate,holistic,magi,BLZ_240312,kendall,random,5,6,0.7999999999999999,0.08333333333333333
689
+ aggregate,holistic,magi,BLZ_240312,kendall,random,5,7,0.7999999999999999,0.08333333333333333
690
+ aggregate,holistic,magi,BLZ_240312,kendall,random,5,8,0.39999999999999997,0.48333333333333334
691
+ aggregate,holistic,magi,BLZ_240312,kendall,random,5,9,0.6,0.23333333333333334
692
+ aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,0,0.7999999999999999,0.08333333333333333
693
+ aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,1,0.9999999999999999,0.016666666666666666
694
+ aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,2,0.9999999999999999,0.016666666666666666
695
+ aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,3,0.7999999999999999,0.08333333333333333
696
+ aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,4,0.7999999999999999,0.08333333333333333
697
+ aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,5,0.6,0.23333333333333334
698
+ aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,6,0.9999999999999999,0.016666666666666666
699
+ aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,7,0.6,0.23333333333333334
700
+ aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,8,0.9999999999999999,0.016666666666666666
701
+ aggregate,holistic,mmlu,BLZ_240312,kendall,random,5,9,0.9999999999999999,0.016666666666666666
702
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,0,0.7999999999999999,0.08333333333333333
703
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,1,0.7999999999999999,0.08333333333333333
704
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,2,0.7999999999999999,0.08333333333333333
705
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,3,0.9999999999999999,0.016666666666666666
706
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,4,0.9999999999999999,0.016666666666666666
707
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,5,0.39999999999999997,0.48333333333333334
708
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,6,0.9999999999999999,0.016666666666666666
709
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,7,0.7999999999999999,0.08333333333333333
710
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,8,0.7999999999999999,0.08333333333333333
711
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,random,5,9,0.9999999999999999,0.016666666666666666
712
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,0,0.6,0.23333333333333334
713
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,1,0.9999999999999999,0.016666666666666666
714
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,2,0.7999999999999999,0.08333333333333333
715
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,3,0.6,0.23333333333333334
716
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,4,0.9999999999999999,0.016666666666666666
717
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,5,0.6,0.23333333333333334
718
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,6,0.7999999999999999,0.08333333333333333
719
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,7,0.9999999999999999,0.016666666666666666
720
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,8,0.7999999999999999,0.08333333333333333
721
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,random,5,9,0.7999999999999999,0.08333333333333333
cache/agreements_cache_a8b645e4d5ba862fbfa9ef3ecf73b44c.csv ADDED
@@ -0,0 +1,721 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value
2
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.0,1.0
3
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
4
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,-0.19999999999999998,0.8166666666666667
5
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.6,0.23333333333333334
6
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
7
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.19999999999999998,0.8166666666666667
8
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
9
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
10
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.39999999999999997,0.48333333333333334
11
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
12
+ humaneval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,-0.39999999999999997,0.48333333333333334
13
+ humaneval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,-0.5270462766947298,0.206507295485425
14
+ humaneval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.5270462766947298,0.206507295485425
15
+ humaneval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.5270462766947298,0.206507295485425
16
+ humaneval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,-0.22360679774997896,0.6015081344405899
17
+ humaneval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.10540925533894598,0.8005421074231263
18
+ humaneval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,-0.10540925533894598,0.8005421074231263
19
+ humaneval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.39999999999999997,0.48333333333333334
20
+ humaneval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,-0.10540925533894598,0.8005421074231263
21
+ humaneval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,-0.22360679774997896,0.6015081344405899
22
+ mbpp,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.10540925533894598,0.8005421074231263
23
+ mbpp,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,-0.31622776601683794,0.44848886103153174
24
+ mbpp,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,-0.31622776601683794,0.44848886103153174
25
+ mbpp,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.10540925533894598,0.8005421074231263
26
+ mbpp,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,-0.39999999999999997,0.48333333333333334
27
+ mbpp,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.10540925533894598,0.8005421074231263
28
+ mbpp,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
29
+ mbpp,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.39999999999999997,0.48333333333333334
30
+ mbpp,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.10540925533894598,0.8005421074231263
31
+ mbpp,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.39999999999999997,0.48333333333333334
32
+ winogrande,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,-0.19999999999999998,0.8166666666666667
33
+ winogrande,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.7999999999999999,0.08333333333333333
34
+ winogrande,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.0,1.0
35
+ winogrande,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,-0.19999999999999998,0.8166666666666667
36
+ winogrande,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.31622776601683794,0.44848886103153174
37
+ winogrande,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
38
+ winogrande,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,-0.9999999999999999,0.016666666666666666
39
+ winogrande,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.7999999999999999,0.08333333333333333
40
+ winogrande,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.0,1.0
41
+ winogrande,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,-0.19999999999999998,0.8166666666666667
42
+ grounding,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.9486832980505137,0.02297740150320607
43
+ grounding,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.19999999999999998,0.8166666666666667
44
+ grounding,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.10540925533894598,0.8005421074231263
45
+ grounding,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.31622776601683794,0.44848886103153174
46
+ grounding,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.9486832980505137,0.02297740150320607
47
+ grounding,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.9486832980505137,0.02297740150320607
48
+ grounding,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.31622776601683794,0.44848886103153174
49
+ grounding,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.19999999999999998,0.8166666666666667
50
+ grounding,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.39999999999999997,0.48333333333333334
51
+ grounding,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.19999999999999998,0.8166666666666667
52
+ instruction_following,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.0,1.0
53
+ instruction_following,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
54
+ instruction_following,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.7378647873726218,0.07697417298126676
55
+ instruction_following,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,3,-0.39999999999999997,0.48333333333333334
56
+ instruction_following,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.8944271909999159,0.0367138563627041
57
+ instruction_following,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.9486832980505137,0.02297740150320607
58
+ instruction_following,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.9486832980505137,0.02297740150320607
59
+ instruction_following,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.0,1.0
60
+ instruction_following,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.8944271909999159,0.0367138563627041
61
+ instruction_following,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.7378647873726218,0.07697417298126676
62
+ planning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.7378647873726218,0.07697417298126676
63
+ planning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.7999999999999999,0.08333333333333333
64
+ planning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.39999999999999997,0.48333333333333334
65
+ planning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.39999999999999997,0.48333333333333334
66
+ planning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.19999999999999998,0.8166666666666667
67
+ planning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
68
+ planning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
69
+ planning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.7378647873726218,0.07697417298126676
70
+ planning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,8,-0.10540925533894598,0.8005421074231263
71
+ planning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.7378647873726218,0.07697417298126676
72
+ reasoning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.7378647873726218,0.07697417298126676
73
+ reasoning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.6,0.23333333333333334
74
+ reasoning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.7378647873726218,0.07697417298126676
75
+ reasoning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.6,0.23333333333333334
76
+ reasoning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.5270462766947298,0.206507295485425
77
+ reasoning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
78
+ reasoning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.5270462766947298,0.206507295485425
79
+ reasoning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
80
+ reasoning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.0,1.0
81
+ reasoning,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.39999999999999997,0.48333333333333334
82
+ refinement,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
83
+ refinement,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
84
+ refinement,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.19999999999999998,0.8166666666666667
85
+ refinement,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.39999999999999997,0.48333333333333334
86
+ refinement,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
87
+ refinement,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
88
+ refinement,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
89
+ refinement,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.39999999999999997,0.48333333333333334
90
+ refinement,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.39999999999999997,0.48333333333333334
91
+ refinement,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
92
+ safety,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.0,1.0
93
+ safety,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.0,1.0
94
+ safety,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,2,-0.19999999999999998,0.8166666666666667
95
+ safety,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.0,1.0
96
+ safety,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,4,-0.19999999999999998,0.8166666666666667
97
+ safety,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,5,-0.6,0.23333333333333334
98
+ safety,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.7999999999999999,0.08333333333333333
99
+ safety,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,7,-0.6,0.23333333333333334
100
+ safety,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.6,0.23333333333333334
101
+ safety,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
102
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.19999999999999998,0.8166666666666667
103
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
104
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.6,0.23333333333333334
105
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.0,1.0
106
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.19999999999999998,0.8166666666666667
107
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.19999999999999998,0.8166666666666667
108
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
109
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.0,1.0
110
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.19999999999999998,0.8166666666666667
111
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.39999999999999997,0.48333333333333334
112
+ tool_usage,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.19999999999999998,0.8166666666666667
113
+ tool_usage,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
114
+ tool_usage,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.9999999999999999,0.016666666666666666
115
+ tool_usage,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.9999999999999999,0.016666666666666666
116
+ tool_usage,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
117
+ tool_usage,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.9999999999999999,0.016666666666666666
118
+ tool_usage,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.19999999999999998,0.8166666666666667
119
+ tool_usage,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
120
+ tool_usage,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.7999999999999999,0.08333333333333333
121
+ tool_usage,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.39999999999999997,0.48333333333333334
122
+ livebench_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
123
+ livebench_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.7999999999999999,0.08333333333333333
124
+ livebench_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.7999999999999999,0.08333333333333333
125
+ livebench_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.6,0.23333333333333334
126
+ livebench_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.6,0.23333333333333334
127
+ livebench_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
128
+ livebench_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.6,0.23333333333333334
129
+ livebench_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.7999999999999999,0.08333333333333333
130
+ livebench_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.39999999999999997,0.48333333333333334
131
+ livebench_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.39999999999999997,0.48333333333333334
132
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.7378647873726218,0.07697417298126676
133
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.5270462766947298,0.206507295485425
134
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.7378647873726218,0.07697417298126676
135
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.7378647873726218,0.07697417298126676
136
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.5270462766947298,0.206507295485425
137
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
138
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.7378647873726218,0.07697417298126676
139
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.7999999999999999,0.08333333333333333
140
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.5270462766947298,0.206507295485425
141
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.7378647873726218,0.07697417298126676
142
+ coding_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.31622776601683794,0.44848886103153174
143
+ coding_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.31622776601683794,0.44848886103153174
144
+ coding_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.6,0.23333333333333334
145
+ coding_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.31622776601683794,0.44848886103153174
146
+ coding_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.31622776601683794,0.44848886103153174
147
+ coding_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.5270462766947298,0.206507295485425
148
+ coding_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.5270462766947298,0.206507295485425
149
+ coding_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.5270462766947298,0.206507295485425
150
+ coding_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.5270462766947298,0.206507295485425
151
+ coding_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.5270462766947298,0.206507295485425
152
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
153
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.6,0.23333333333333334
154
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.6,0.23333333333333334
155
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.6,0.23333333333333334
156
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.6,0.23333333333333334
157
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.6,0.23333333333333334
158
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.6,0.23333333333333334
159
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
160
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.6,0.23333333333333334
161
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
162
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
163
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.6,0.23333333333333334
164
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.6,0.23333333333333334
165
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.6,0.23333333333333334
166
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.7999999999999999,0.08333333333333333
167
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
168
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.7999999999999999,0.08333333333333333
169
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
170
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.7999999999999999,0.08333333333333333
171
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.7999999999999999,0.08333333333333333
172
+ language_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
173
+ language_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.7999999999999999,0.08333333333333333
174
+ language_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.7999999999999999,0.08333333333333333
175
+ language_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.7999999999999999,0.08333333333333333
176
+ language_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.7999999999999999,0.08333333333333333
177
+ language_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
178
+ language_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.6,0.23333333333333334
179
+ language_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
180
+ language_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.6,0.23333333333333334
181
+ language_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
182
+ if_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.0,1.0
183
+ if_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,1,-0.6,0.23333333333333334
184
+ if_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.19999999999999998,0.8166666666666667
185
+ if_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.19999999999999998,0.8166666666666667
186
+ if_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.19999999999999998,0.8166666666666667
187
+ if_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.19999999999999998,0.8166666666666667
188
+ if_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.19999999999999998,0.8166666666666667
189
+ if_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.19999999999999998,0.8166666666666667
190
+ if_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.0,1.0
191
+ if_average,livebench_240701,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.19999999999999998,0.8166666666666667
192
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.39999999999999997,0.48333333333333334
193
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.9999999999999999,0.016666666666666666
194
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.19999999999999998,0.8166666666666667
195
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.7999999999999999,0.08333333333333333
196
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.9999999999999999,0.016666666666666666
197
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
198
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
199
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.9999999999999999,0.016666666666666666
200
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.7999999999999999,0.08333333333333333
201
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.7999999999999999,0.08333333333333333
202
+ mixeval,mixeval_240601,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.7999999999999999,0.08333333333333333
203
+ mixeval,mixeval_240601,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.7999999999999999,0.08333333333333333
204
+ mixeval,mixeval_240601,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.7999999999999999,0.08333333333333333
205
+ mixeval,mixeval_240601,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.7999999999999999,0.08333333333333333
206
+ mixeval,mixeval_240601,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.7999999999999999,0.08333333333333333
207
+ mixeval,mixeval_240601,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
208
+ mixeval,mixeval_240601,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.7999999999999999,0.08333333333333333
209
+ mixeval,mixeval_240601,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.7999999999999999,0.08333333333333333
210
+ mixeval,mixeval_240601,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.7999999999999999,0.08333333333333333
211
+ mixeval,mixeval_240601,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.7999999999999999,0.08333333333333333
212
+ agieval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.31622776601683794,0.44848886103153174
213
+ agieval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.31622776601683794,0.44848886103153174
214
+ agieval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.39999999999999997,0.48333333333333334
215
+ agieval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.7999999999999999,0.08333333333333333
216
+ agieval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.7999999999999999,0.08333333333333333
217
+ agieval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.9999999999999999,0.016666666666666666
218
+ agieval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.19999999999999998,0.8166666666666667
219
+ agieval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.19999999999999998,0.8166666666666667
220
+ agieval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.39999999999999997,0.48333333333333334
221
+ agieval,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.7999999999999999,0.08333333333333333
222
+ arc_c,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
223
+ arc_c,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
224
+ arc_c,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,-0.19999999999999998,0.8166666666666667
225
+ arc_c,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,-0.39999999999999997,0.48333333333333334
226
+ arc_c,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.0,1.0
227
+ arc_c,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.6,0.23333333333333334
228
+ arc_c,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.31622776601683794,0.44848886103153174
229
+ arc_c,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
230
+ arc_c,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.39999999999999997,0.48333333333333334
231
+ arc_c,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.7378647873726218,0.07697417298126676
232
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,-0.31622776601683794,0.44848886103153174
233
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,-0.7999999999999999,0.08333333333333333
234
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.7999999999999999,0.08333333333333333
235
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.7999999999999999,0.08333333333333333
236
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
237
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
238
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.6,0.23333333333333334
239
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.7378647873726218,0.07697417298126676
240
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.6,0.23333333333333334
241
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.31622776601683794,0.44848886103153174
242
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.39999999999999997,0.48333333333333334
243
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.19999999999999998,0.8166666666666667
244
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.39999999999999997,0.48333333333333334
245
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.7378647873726218,0.07697417298126676
246
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
247
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,-0.6,0.23333333333333334
248
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,-0.39999999999999997,0.48333333333333334
249
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
250
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.0,1.0
251
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
252
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.0,1.0
253
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,-0.6,0.23333333333333334
254
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.7999999999999999,0.08333333333333333
255
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.0,1.0
256
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.6,0.23333333333333334
257
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
258
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
259
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
260
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,-0.39999999999999997,0.48333333333333334
261
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.0,1.0
262
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.31622776601683794,0.44848886103153174
263
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.6,0.23333333333333334
264
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.31622776601683794,0.44848886103153174
265
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,-0.9999999999999999,0.016666666666666666
266
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
267
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.31622776601683794,0.44848886103153174
268
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,-0.9999999999999999,0.016666666666666666
269
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
270
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,-0.10540925533894598,0.8005421074231263
271
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
272
+ bbh,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.9486832980505137,0.02297740150320607
273
+ bbh,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.31622776601683794,0.44848886103153174
274
+ bbh,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,-0.39999999999999997,0.48333333333333334
275
+ bbh,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.9999999999999999,0.016666666666666666
276
+ bbh,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.0,1.0
277
+ bbh,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,-0.39999999999999997,0.48333333333333334
278
+ bbh,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.7999999999999999,0.08333333333333333
279
+ bbh,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.7999999999999999,0.08333333333333333
280
+ bbh,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.5270462766947298,0.206507295485425
281
+ bbh,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.5270462766947298,0.206507295485425
282
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,-0.19999999999999998,0.8166666666666667
283
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.19999999999999998,0.8166666666666667
284
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.0,1.0
285
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.0,1.0
286
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,-0.39999999999999997,0.48333333333333334
287
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
288
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
289
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.7999999999999999,0.08333333333333333
290
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.19999999999999998,0.8166666666666667
291
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.7999999999999999,0.08333333333333333
292
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.7378647873726218,0.07697417298126676
293
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
294
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.31622776601683794,0.44848886103153174
295
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.39999999999999997,0.48333333333333334
296
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.31622776601683794,0.44848886103153174
297
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
298
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,-0.39999999999999997,0.48333333333333334
299
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,-0.39999999999999997,0.48333333333333334
300
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.39999999999999997,0.48333333333333334
301
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.5270462766947298,0.206507295485425
302
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,-0.6,0.23333333333333334
303
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,-0.6,0.23333333333333334
304
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.19999999999999998,0.8166666666666667
305
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.39999999999999997,0.48333333333333334
306
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.19999999999999998,0.8166666666666667
307
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.6,0.23333333333333334
308
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,-0.39999999999999997,0.48333333333333334
309
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,-0.6,0.23333333333333334
310
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,-0.39999999999999997,0.48333333333333334
311
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
312
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.11952286093343936,0.7815112949987133
313
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.0,1.0
314
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.0,1.0
315
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.6,0.23333333333333334
316
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
317
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.6,0.23333333333333334
318
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.35856858280031806,0.40538055645894233
319
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.31622776601683794,0.44848886103153174
320
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.0,1.0
321
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.5270462766947298,0.206507295485425
322
+ magi,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.39999999999999997,0.48333333333333334
323
+ magi,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.19999999999999998,0.8166666666666667
324
+ magi,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,-0.10540925533894598,0.8005421074231263
325
+ magi,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.39999999999999997,0.48333333333333334
326
+ magi,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.6,0.23333333333333334
327
+ magi,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
328
+ magi,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.6,0.23333333333333334
329
+ magi,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.7999999999999999,0.08333333333333333
330
+ magi,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.19999999999999998,0.8166666666666667
331
+ magi,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,-0.19999999999999998,0.8166666666666667
332
+ mmlu,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
333
+ mmlu,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
334
+ mmlu,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.0,1.0
335
+ mmlu,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.39999999999999997,0.48333333333333334
336
+ mmlu,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.6,0.23333333333333334
337
+ mmlu,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.6,0.23333333333333334
338
+ mmlu,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.6,0.23333333333333334
339
+ mmlu,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
340
+ mmlu,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.6,0.23333333333333334
341
+ mmlu,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.5270462766947298,0.206507295485425
342
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
343
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.0,1.0
344
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.10540925533894598,0.8005421074231263
345
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.39999999999999997,0.48333333333333334
346
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
347
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.19999999999999998,0.8166666666666667
348
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.10540925533894598,0.8005421074231263
349
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,7,-0.39999999999999997,0.48333333333333334
350
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.19999999999999998,0.8166666666666667
351
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.39999999999999997,0.48333333333333334
352
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,0,0.39999999999999997,0.48333333333333334
353
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
354
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,2,0.19999999999999998,0.8166666666666667
355
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,3,0.6,0.23333333333333334
356
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,4,0.6,0.23333333333333334
357
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
358
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
359
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,7,0.39999999999999997,0.48333333333333334
360
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,8,0.6,0.23333333333333334
361
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,somewhere_aggregate,5,9,0.7999999999999999,0.08333333333333333
362
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,somewhere_aggregate,5,0,0.0,1.0
363
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
364
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,somewhere_aggregate,5,2,-0.19999999999999998,0.8166666666666667
365
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,somewhere_aggregate,5,3,0.6,0.23333333333333334
366
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
367
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,somewhere_aggregate,5,5,0.19999999999999998,0.8166666666666667
368
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
369
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
370
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,somewhere_aggregate,5,8,0.39999999999999997,0.48333333333333334
371
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
372
+ aggregate,holistic,humaneval,BLZ_240312,kendall,somewhere_aggregate,5,0,-0.39999999999999997,0.48333333333333334
373
+ aggregate,holistic,humaneval,BLZ_240312,kendall,somewhere_aggregate,5,1,-0.5270462766947298,0.206507295485425
374
+ aggregate,holistic,humaneval,BLZ_240312,kendall,somewhere_aggregate,5,2,0.5270462766947298,0.206507295485425
375
+ aggregate,holistic,humaneval,BLZ_240312,kendall,somewhere_aggregate,5,3,0.5270462766947298,0.206507295485425
376
+ aggregate,holistic,humaneval,BLZ_240312,kendall,somewhere_aggregate,5,4,-0.22360679774997896,0.6015081344405899
377
+ aggregate,holistic,humaneval,BLZ_240312,kendall,somewhere_aggregate,5,5,0.10540925533894598,0.8005421074231263
378
+ aggregate,holistic,humaneval,BLZ_240312,kendall,somewhere_aggregate,5,6,-0.10540925533894598,0.8005421074231263
379
+ aggregate,holistic,humaneval,BLZ_240312,kendall,somewhere_aggregate,5,7,0.39999999999999997,0.48333333333333334
380
+ aggregate,holistic,humaneval,BLZ_240312,kendall,somewhere_aggregate,5,8,-0.10540925533894598,0.8005421074231263
381
+ aggregate,holistic,humaneval,BLZ_240312,kendall,somewhere_aggregate,5,9,-0.22360679774997896,0.6015081344405899
382
+ aggregate,holistic,mbpp,BLZ_240312,kendall,somewhere_aggregate,5,0,0.10540925533894598,0.8005421074231263
383
+ aggregate,holistic,mbpp,BLZ_240312,kendall,somewhere_aggregate,5,1,-0.31622776601683794,0.44848886103153174
384
+ aggregate,holistic,mbpp,BLZ_240312,kendall,somewhere_aggregate,5,2,-0.31622776601683794,0.44848886103153174
385
+ aggregate,holistic,mbpp,BLZ_240312,kendall,somewhere_aggregate,5,3,0.10540925533894598,0.8005421074231263
386
+ aggregate,holistic,mbpp,BLZ_240312,kendall,somewhere_aggregate,5,4,-0.39999999999999997,0.48333333333333334
387
+ aggregate,holistic,mbpp,BLZ_240312,kendall,somewhere_aggregate,5,5,0.10540925533894598,0.8005421074231263
388
+ aggregate,holistic,mbpp,BLZ_240312,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
389
+ aggregate,holistic,mbpp,BLZ_240312,kendall,somewhere_aggregate,5,7,0.39999999999999997,0.48333333333333334
390
+ aggregate,holistic,mbpp,BLZ_240312,kendall,somewhere_aggregate,5,8,0.10540925533894598,0.8005421074231263
391
+ aggregate,holistic,mbpp,BLZ_240312,kendall,somewhere_aggregate,5,9,0.39999999999999997,0.48333333333333334
392
+ aggregate,holistic,winogrande,BLZ_240312,kendall,somewhere_aggregate,5,0,-0.19999999999999998,0.8166666666666667
393
+ aggregate,holistic,winogrande,BLZ_240312,kendall,somewhere_aggregate,5,1,0.7999999999999999,0.08333333333333333
394
+ aggregate,holistic,winogrande,BLZ_240312,kendall,somewhere_aggregate,5,2,0.0,1.0
395
+ aggregate,holistic,winogrande,BLZ_240312,kendall,somewhere_aggregate,5,3,-0.19999999999999998,0.8166666666666667
396
+ aggregate,holistic,winogrande,BLZ_240312,kendall,somewhere_aggregate,5,4,0.31622776601683794,0.44848886103153174
397
+ aggregate,holistic,winogrande,BLZ_240312,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
398
+ aggregate,holistic,winogrande,BLZ_240312,kendall,somewhere_aggregate,5,6,-0.9999999999999999,0.016666666666666666
399
+ aggregate,holistic,winogrande,BLZ_240312,kendall,somewhere_aggregate,5,7,0.7999999999999999,0.08333333333333333
400
+ aggregate,holistic,winogrande,BLZ_240312,kendall,somewhere_aggregate,5,8,0.0,1.0
401
+ aggregate,holistic,winogrande,BLZ_240312,kendall,somewhere_aggregate,5,9,-0.19999999999999998,0.8166666666666667
402
+ aggregate,holistic,grounding,biggen_240612,kendall,somewhere_aggregate,5,0,0.9486832980505137,0.02297740150320607
403
+ aggregate,holistic,grounding,biggen_240612,kendall,somewhere_aggregate,5,1,0.19999999999999998,0.8166666666666667
404
+ aggregate,holistic,grounding,biggen_240612,kendall,somewhere_aggregate,5,2,0.10540925533894598,0.8005421074231263
405
+ aggregate,holistic,grounding,biggen_240612,kendall,somewhere_aggregate,5,3,0.31622776601683794,0.44848886103153174
406
+ aggregate,holistic,grounding,biggen_240612,kendall,somewhere_aggregate,5,4,0.9486832980505137,0.02297740150320607
407
+ aggregate,holistic,grounding,biggen_240612,kendall,somewhere_aggregate,5,5,0.9486832980505137,0.02297740150320607
408
+ aggregate,holistic,grounding,biggen_240612,kendall,somewhere_aggregate,5,6,0.31622776601683794,0.44848886103153174
409
+ aggregate,holistic,grounding,biggen_240612,kendall,somewhere_aggregate,5,7,0.19999999999999998,0.8166666666666667
410
+ aggregate,holistic,grounding,biggen_240612,kendall,somewhere_aggregate,5,8,0.39999999999999997,0.48333333333333334
411
+ aggregate,holistic,grounding,biggen_240612,kendall,somewhere_aggregate,5,9,0.19999999999999998,0.8166666666666667
412
+ aggregate,holistic,instruction_following,biggen_240612,kendall,somewhere_aggregate,5,0,0.0,1.0
413
+ aggregate,holistic,instruction_following,biggen_240612,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
414
+ aggregate,holistic,instruction_following,biggen_240612,kendall,somewhere_aggregate,5,2,0.7378647873726218,0.07697417298126676
415
+ aggregate,holistic,instruction_following,biggen_240612,kendall,somewhere_aggregate,5,3,-0.39999999999999997,0.48333333333333334
416
+ aggregate,holistic,instruction_following,biggen_240612,kendall,somewhere_aggregate,5,4,0.8944271909999159,0.0367138563627041
417
+ aggregate,holistic,instruction_following,biggen_240612,kendall,somewhere_aggregate,5,5,0.9486832980505137,0.02297740150320607
418
+ aggregate,holistic,instruction_following,biggen_240612,kendall,somewhere_aggregate,5,6,0.9486832980505137,0.02297740150320607
419
+ aggregate,holistic,instruction_following,biggen_240612,kendall,somewhere_aggregate,5,7,0.0,1.0
420
+ aggregate,holistic,instruction_following,biggen_240612,kendall,somewhere_aggregate,5,8,0.8944271909999159,0.0367138563627041
421
+ aggregate,holistic,instruction_following,biggen_240612,kendall,somewhere_aggregate,5,9,0.7378647873726218,0.07697417298126676
422
+ aggregate,holistic,planning,biggen_240612,kendall,somewhere_aggregate,5,0,0.7378647873726218,0.07697417298126676
423
+ aggregate,holistic,planning,biggen_240612,kendall,somewhere_aggregate,5,1,0.7999999999999999,0.08333333333333333
424
+ aggregate,holistic,planning,biggen_240612,kendall,somewhere_aggregate,5,2,0.39999999999999997,0.48333333333333334
425
+ aggregate,holistic,planning,biggen_240612,kendall,somewhere_aggregate,5,3,0.39999999999999997,0.48333333333333334
426
+ aggregate,holistic,planning,biggen_240612,kendall,somewhere_aggregate,5,4,0.19999999999999998,0.8166666666666667
427
+ aggregate,holistic,planning,biggen_240612,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
428
+ aggregate,holistic,planning,biggen_240612,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
429
+ aggregate,holistic,planning,biggen_240612,kendall,somewhere_aggregate,5,7,0.7378647873726218,0.07697417298126676
430
+ aggregate,holistic,planning,biggen_240612,kendall,somewhere_aggregate,5,8,-0.10540925533894598,0.8005421074231263
431
+ aggregate,holistic,planning,biggen_240612,kendall,somewhere_aggregate,5,9,0.7378647873726218,0.07697417298126676
432
+ aggregate,holistic,reasoning,biggen_240612,kendall,somewhere_aggregate,5,0,0.7378647873726218,0.07697417298126676
433
+ aggregate,holistic,reasoning,biggen_240612,kendall,somewhere_aggregate,5,1,0.6,0.23333333333333334
434
+ aggregate,holistic,reasoning,biggen_240612,kendall,somewhere_aggregate,5,2,0.7378647873726218,0.07697417298126676
435
+ aggregate,holistic,reasoning,biggen_240612,kendall,somewhere_aggregate,5,3,0.6,0.23333333333333334
436
+ aggregate,holistic,reasoning,biggen_240612,kendall,somewhere_aggregate,5,4,0.5270462766947298,0.206507295485425
437
+ aggregate,holistic,reasoning,biggen_240612,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
438
+ aggregate,holistic,reasoning,biggen_240612,kendall,somewhere_aggregate,5,6,0.5270462766947298,0.206507295485425
439
+ aggregate,holistic,reasoning,biggen_240612,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
440
+ aggregate,holistic,reasoning,biggen_240612,kendall,somewhere_aggregate,5,8,0.0,1.0
441
+ aggregate,holistic,reasoning,biggen_240612,kendall,somewhere_aggregate,5,9,0.39999999999999997,0.48333333333333334
442
+ aggregate,holistic,refinement,biggen_240612,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
443
+ aggregate,holistic,refinement,biggen_240612,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
444
+ aggregate,holistic,refinement,biggen_240612,kendall,somewhere_aggregate,5,2,0.19999999999999998,0.8166666666666667
445
+ aggregate,holistic,refinement,biggen_240612,kendall,somewhere_aggregate,5,3,0.39999999999999997,0.48333333333333334
446
+ aggregate,holistic,refinement,biggen_240612,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
447
+ aggregate,holistic,refinement,biggen_240612,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
448
+ aggregate,holistic,refinement,biggen_240612,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
449
+ aggregate,holistic,refinement,biggen_240612,kendall,somewhere_aggregate,5,7,0.39999999999999997,0.48333333333333334
450
+ aggregate,holistic,refinement,biggen_240612,kendall,somewhere_aggregate,5,8,0.39999999999999997,0.48333333333333334
451
+ aggregate,holistic,refinement,biggen_240612,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
452
+ aggregate,holistic,safety,biggen_240612,kendall,somewhere_aggregate,5,0,0.0,1.0
453
+ aggregate,holistic,safety,biggen_240612,kendall,somewhere_aggregate,5,1,0.0,1.0
454
+ aggregate,holistic,safety,biggen_240612,kendall,somewhere_aggregate,5,2,-0.19999999999999998,0.8166666666666667
455
+ aggregate,holistic,safety,biggen_240612,kendall,somewhere_aggregate,5,3,0.0,1.0
456
+ aggregate,holistic,safety,biggen_240612,kendall,somewhere_aggregate,5,4,-0.19999999999999998,0.8166666666666667
457
+ aggregate,holistic,safety,biggen_240612,kendall,somewhere_aggregate,5,5,-0.6,0.23333333333333334
458
+ aggregate,holistic,safety,biggen_240612,kendall,somewhere_aggregate,5,6,0.7999999999999999,0.08333333333333333
459
+ aggregate,holistic,safety,biggen_240612,kendall,somewhere_aggregate,5,7,-0.6,0.23333333333333334
460
+ aggregate,holistic,safety,biggen_240612,kendall,somewhere_aggregate,5,8,0.6,0.23333333333333334
461
+ aggregate,holistic,safety,biggen_240612,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
462
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,somewhere_aggregate,5,0,0.19999999999999998,0.8166666666666667
463
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
464
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,somewhere_aggregate,5,2,0.6,0.23333333333333334
465
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,somewhere_aggregate,5,3,0.0,1.0
466
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,somewhere_aggregate,5,4,0.19999999999999998,0.8166666666666667
467
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,somewhere_aggregate,5,5,0.19999999999999998,0.8166666666666667
468
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
469
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,somewhere_aggregate,5,7,0.0,1.0
470
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,somewhere_aggregate,5,8,0.19999999999999998,0.8166666666666667
471
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,somewhere_aggregate,5,9,0.39999999999999997,0.48333333333333334
472
+ aggregate,holistic,tool_usage,biggen_240612,kendall,somewhere_aggregate,5,0,0.19999999999999998,0.8166666666666667
473
+ aggregate,holistic,tool_usage,biggen_240612,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
474
+ aggregate,holistic,tool_usage,biggen_240612,kendall,somewhere_aggregate,5,2,0.9999999999999999,0.016666666666666666
475
+ aggregate,holistic,tool_usage,biggen_240612,kendall,somewhere_aggregate,5,3,0.9999999999999999,0.016666666666666666
476
+ aggregate,holistic,tool_usage,biggen_240612,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
477
+ aggregate,holistic,tool_usage,biggen_240612,kendall,somewhere_aggregate,5,5,0.9999999999999999,0.016666666666666666
478
+ aggregate,holistic,tool_usage,biggen_240612,kendall,somewhere_aggregate,5,6,0.19999999999999998,0.8166666666666667
479
+ aggregate,holistic,tool_usage,biggen_240612,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
480
+ aggregate,holistic,tool_usage,biggen_240612,kendall,somewhere_aggregate,5,8,0.7999999999999999,0.08333333333333333
481
+ aggregate,holistic,tool_usage,biggen_240612,kendall,somewhere_aggregate,5,9,0.39999999999999997,0.48333333333333334
482
+ aggregate,holistic,livebench_average,livebench_240701,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
483
+ aggregate,holistic,livebench_average,livebench_240701,kendall,somewhere_aggregate,5,1,0.7999999999999999,0.08333333333333333
484
+ aggregate,holistic,livebench_average,livebench_240701,kendall,somewhere_aggregate,5,2,0.7999999999999999,0.08333333333333333
485
+ aggregate,holistic,livebench_average,livebench_240701,kendall,somewhere_aggregate,5,3,0.6,0.23333333333333334
486
+ aggregate,holistic,livebench_average,livebench_240701,kendall,somewhere_aggregate,5,4,0.6,0.23333333333333334
487
+ aggregate,holistic,livebench_average,livebench_240701,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
488
+ aggregate,holistic,livebench_average,livebench_240701,kendall,somewhere_aggregate,5,6,0.6,0.23333333333333334
489
+ aggregate,holistic,livebench_average,livebench_240701,kendall,somewhere_aggregate,5,7,0.7999999999999999,0.08333333333333333
490
+ aggregate,holistic,livebench_average,livebench_240701,kendall,somewhere_aggregate,5,8,0.39999999999999997,0.48333333333333334
491
+ aggregate,holistic,livebench_average,livebench_240701,kendall,somewhere_aggregate,5,9,0.39999999999999997,0.48333333333333334
492
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,somewhere_aggregate,5,0,0.7378647873726218,0.07697417298126676
493
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,somewhere_aggregate,5,1,0.5270462766947298,0.206507295485425
494
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,somewhere_aggregate,5,2,0.7378647873726218,0.07697417298126676
495
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,somewhere_aggregate,5,3,0.7378647873726218,0.07697417298126676
496
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,somewhere_aggregate,5,4,0.5270462766947298,0.206507295485425
497
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
498
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,somewhere_aggregate,5,6,0.7378647873726218,0.07697417298126676
499
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,somewhere_aggregate,5,7,0.7999999999999999,0.08333333333333333
500
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,somewhere_aggregate,5,8,0.5270462766947298,0.206507295485425
501
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,somewhere_aggregate,5,9,0.7378647873726218,0.07697417298126676
502
+ aggregate,holistic,coding_average,livebench_240701,kendall,somewhere_aggregate,5,0,0.31622776601683794,0.44848886103153174
503
+ aggregate,holistic,coding_average,livebench_240701,kendall,somewhere_aggregate,5,1,0.31622776601683794,0.44848886103153174
504
+ aggregate,holistic,coding_average,livebench_240701,kendall,somewhere_aggregate,5,2,0.6,0.23333333333333334
505
+ aggregate,holistic,coding_average,livebench_240701,kendall,somewhere_aggregate,5,3,0.31622776601683794,0.44848886103153174
506
+ aggregate,holistic,coding_average,livebench_240701,kendall,somewhere_aggregate,5,4,0.31622776601683794,0.44848886103153174
507
+ aggregate,holistic,coding_average,livebench_240701,kendall,somewhere_aggregate,5,5,0.5270462766947298,0.206507295485425
508
+ aggregate,holistic,coding_average,livebench_240701,kendall,somewhere_aggregate,5,6,0.5270462766947298,0.206507295485425
509
+ aggregate,holistic,coding_average,livebench_240701,kendall,somewhere_aggregate,5,7,0.5270462766947298,0.206507295485425
510
+ aggregate,holistic,coding_average,livebench_240701,kendall,somewhere_aggregate,5,8,0.5270462766947298,0.206507295485425
511
+ aggregate,holistic,coding_average,livebench_240701,kendall,somewhere_aggregate,5,9,0.5270462766947298,0.206507295485425
512
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
513
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,somewhere_aggregate,5,1,0.6,0.23333333333333334
514
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,somewhere_aggregate,5,2,0.6,0.23333333333333334
515
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,somewhere_aggregate,5,3,0.6,0.23333333333333334
516
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,somewhere_aggregate,5,4,0.6,0.23333333333333334
517
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,somewhere_aggregate,5,5,0.6,0.23333333333333334
518
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,somewhere_aggregate,5,6,0.6,0.23333333333333334
519
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
520
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,somewhere_aggregate,5,8,0.6,0.23333333333333334
521
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
522
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
523
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,somewhere_aggregate,5,1,0.6,0.23333333333333334
524
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,somewhere_aggregate,5,2,0.6,0.23333333333333334
525
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,somewhere_aggregate,5,3,0.6,0.23333333333333334
526
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,somewhere_aggregate,5,4,0.7999999999999999,0.08333333333333333
527
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
528
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,somewhere_aggregate,5,6,0.7999999999999999,0.08333333333333333
529
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
530
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,somewhere_aggregate,5,8,0.7999999999999999,0.08333333333333333
531
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,somewhere_aggregate,5,9,0.7999999999999999,0.08333333333333333
532
+ aggregate,holistic,language_average,livebench_240701,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
533
+ aggregate,holistic,language_average,livebench_240701,kendall,somewhere_aggregate,5,1,0.7999999999999999,0.08333333333333333
534
+ aggregate,holistic,language_average,livebench_240701,kendall,somewhere_aggregate,5,2,0.7999999999999999,0.08333333333333333
535
+ aggregate,holistic,language_average,livebench_240701,kendall,somewhere_aggregate,5,3,0.7999999999999999,0.08333333333333333
536
+ aggregate,holistic,language_average,livebench_240701,kendall,somewhere_aggregate,5,4,0.7999999999999999,0.08333333333333333
537
+ aggregate,holistic,language_average,livebench_240701,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
538
+ aggregate,holistic,language_average,livebench_240701,kendall,somewhere_aggregate,5,6,0.6,0.23333333333333334
539
+ aggregate,holistic,language_average,livebench_240701,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
540
+ aggregate,holistic,language_average,livebench_240701,kendall,somewhere_aggregate,5,8,0.6,0.23333333333333334
541
+ aggregate,holistic,language_average,livebench_240701,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
542
+ aggregate,holistic,if_average,livebench_240701,kendall,somewhere_aggregate,5,0,0.0,1.0
543
+ aggregate,holistic,if_average,livebench_240701,kendall,somewhere_aggregate,5,1,-0.6,0.23333333333333334
544
+ aggregate,holistic,if_average,livebench_240701,kendall,somewhere_aggregate,5,2,0.19999999999999998,0.8166666666666667
545
+ aggregate,holistic,if_average,livebench_240701,kendall,somewhere_aggregate,5,3,0.19999999999999998,0.8166666666666667
546
+ aggregate,holistic,if_average,livebench_240701,kendall,somewhere_aggregate,5,4,0.19999999999999998,0.8166666666666667
547
+ aggregate,holistic,if_average,livebench_240701,kendall,somewhere_aggregate,5,5,0.19999999999999998,0.8166666666666667
548
+ aggregate,holistic,if_average,livebench_240701,kendall,somewhere_aggregate,5,6,0.19999999999999998,0.8166666666666667
549
+ aggregate,holistic,if_average,livebench_240701,kendall,somewhere_aggregate,5,7,0.19999999999999998,0.8166666666666667
550
+ aggregate,holistic,if_average,livebench_240701,kendall,somewhere_aggregate,5,8,0.0,1.0
551
+ aggregate,holistic,if_average,livebench_240701,kendall,somewhere_aggregate,5,9,0.19999999999999998,0.8166666666666667
552
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,somewhere_aggregate,5,0,0.39999999999999997,0.48333333333333334
553
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,somewhere_aggregate,5,1,0.9999999999999999,0.016666666666666666
554
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,somewhere_aggregate,5,2,0.19999999999999998,0.8166666666666667
555
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,somewhere_aggregate,5,3,0.7999999999999999,0.08333333333333333
556
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,somewhere_aggregate,5,4,0.9999999999999999,0.016666666666666666
557
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
558
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
559
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,somewhere_aggregate,5,7,0.9999999999999999,0.016666666666666666
560
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,somewhere_aggregate,5,8,0.7999999999999999,0.08333333333333333
561
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,somewhere_aggregate,5,9,0.7999999999999999,0.08333333333333333
562
+ aggregate,holistic,mixeval,mixeval_240601,kendall,somewhere_aggregate,5,0,0.7999999999999999,0.08333333333333333
563
+ aggregate,holistic,mixeval,mixeval_240601,kendall,somewhere_aggregate,5,1,0.7999999999999999,0.08333333333333333
564
+ aggregate,holistic,mixeval,mixeval_240601,kendall,somewhere_aggregate,5,2,0.7999999999999999,0.08333333333333333
565
+ aggregate,holistic,mixeval,mixeval_240601,kendall,somewhere_aggregate,5,3,0.7999999999999999,0.08333333333333333
566
+ aggregate,holistic,mixeval,mixeval_240601,kendall,somewhere_aggregate,5,4,0.7999999999999999,0.08333333333333333
567
+ aggregate,holistic,mixeval,mixeval_240601,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
568
+ aggregate,holistic,mixeval,mixeval_240601,kendall,somewhere_aggregate,5,6,0.7999999999999999,0.08333333333333333
569
+ aggregate,holistic,mixeval,mixeval_240601,kendall,somewhere_aggregate,5,7,0.7999999999999999,0.08333333333333333
570
+ aggregate,holistic,mixeval,mixeval_240601,kendall,somewhere_aggregate,5,8,0.7999999999999999,0.08333333333333333
571
+ aggregate,holistic,mixeval,mixeval_240601,kendall,somewhere_aggregate,5,9,0.7999999999999999,0.08333333333333333
572
+ aggregate,holistic,agieval,BLZ_240312,kendall,somewhere_aggregate,5,0,0.31622776601683794,0.44848886103153174
573
+ aggregate,holistic,agieval,BLZ_240312,kendall,somewhere_aggregate,5,1,0.31622776601683794,0.44848886103153174
574
+ aggregate,holistic,agieval,BLZ_240312,kendall,somewhere_aggregate,5,2,0.39999999999999997,0.48333333333333334
575
+ aggregate,holistic,agieval,BLZ_240312,kendall,somewhere_aggregate,5,3,0.7999999999999999,0.08333333333333333
576
+ aggregate,holistic,agieval,BLZ_240312,kendall,somewhere_aggregate,5,4,0.7999999999999999,0.08333333333333333
577
+ aggregate,holistic,agieval,BLZ_240312,kendall,somewhere_aggregate,5,5,0.9999999999999999,0.016666666666666666
578
+ aggregate,holistic,agieval,BLZ_240312,kendall,somewhere_aggregate,5,6,0.19999999999999998,0.8166666666666667
579
+ aggregate,holistic,agieval,BLZ_240312,kendall,somewhere_aggregate,5,7,0.19999999999999998,0.8166666666666667
580
+ aggregate,holistic,agieval,BLZ_240312,kendall,somewhere_aggregate,5,8,0.39999999999999997,0.48333333333333334
581
+ aggregate,holistic,agieval,BLZ_240312,kendall,somewhere_aggregate,5,9,0.7999999999999999,0.08333333333333333
582
+ aggregate,holistic,arc_c,BLZ_240312,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
583
+ aggregate,holistic,arc_c,BLZ_240312,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
584
+ aggregate,holistic,arc_c,BLZ_240312,kendall,somewhere_aggregate,5,2,-0.19999999999999998,0.8166666666666667
585
+ aggregate,holistic,arc_c,BLZ_240312,kendall,somewhere_aggregate,5,3,-0.39999999999999997,0.48333333333333334
586
+ aggregate,holistic,arc_c,BLZ_240312,kendall,somewhere_aggregate,5,4,0.0,1.0
587
+ aggregate,holistic,arc_c,BLZ_240312,kendall,somewhere_aggregate,5,5,0.6,0.23333333333333334
588
+ aggregate,holistic,arc_c,BLZ_240312,kendall,somewhere_aggregate,5,6,0.31622776601683794,0.44848886103153174
589
+ aggregate,holistic,arc_c,BLZ_240312,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
590
+ aggregate,holistic,arc_c,BLZ_240312,kendall,somewhere_aggregate,5,8,0.39999999999999997,0.48333333333333334
591
+ aggregate,holistic,arc_c,BLZ_240312,kendall,somewhere_aggregate,5,9,0.7378647873726218,0.07697417298126676
592
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,somewhere_aggregate,5,0,-0.31622776601683794,0.44848886103153174
593
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,somewhere_aggregate,5,1,-0.7999999999999999,0.08333333333333333
594
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,somewhere_aggregate,5,2,0.7999999999999999,0.08333333333333333
595
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,somewhere_aggregate,5,3,0.7999999999999999,0.08333333333333333
596
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
597
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
598
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,somewhere_aggregate,5,6,0.6,0.23333333333333334
599
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,somewhere_aggregate,5,7,0.7378647873726218,0.07697417298126676
600
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,somewhere_aggregate,5,8,0.6,0.23333333333333334
601
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,somewhere_aggregate,5,9,0.31622776601683794,0.44848886103153174
602
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,somewhere_aggregate,5,0,0.39999999999999997,0.48333333333333334
603
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,somewhere_aggregate,5,1,0.19999999999999998,0.8166666666666667
604
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,somewhere_aggregate,5,2,0.39999999999999997,0.48333333333333334
605
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,somewhere_aggregate,5,3,0.7378647873726218,0.07697417298126676
606
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
607
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,somewhere_aggregate,5,5,-0.6,0.23333333333333334
608
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,somewhere_aggregate,5,6,-0.39999999999999997,0.48333333333333334
609
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
610
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,somewhere_aggregate,5,8,0.0,1.0
611
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
612
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,somewhere_aggregate,5,0,0.0,1.0
613
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,somewhere_aggregate,5,1,-0.6,0.23333333333333334
614
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,somewhere_aggregate,5,2,0.7999999999999999,0.08333333333333333
615
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,somewhere_aggregate,5,3,0.0,1.0
616
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,somewhere_aggregate,5,4,0.6,0.23333333333333334
617
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
618
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
619
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
620
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,somewhere_aggregate,5,8,-0.39999999999999997,0.48333333333333334
621
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,somewhere_aggregate,5,9,0.0,1.0
622
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,somewhere_aggregate,5,0,0.31622776601683794,0.44848886103153174
623
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,somewhere_aggregate,5,1,0.6,0.23333333333333334
624
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,somewhere_aggregate,5,2,0.31622776601683794,0.44848886103153174
625
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,somewhere_aggregate,5,3,-0.9999999999999999,0.016666666666666666
626
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
627
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,somewhere_aggregate,5,5,0.31622776601683794,0.44848886103153174
628
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,somewhere_aggregate,5,6,-0.9999999999999999,0.016666666666666666
629
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
630
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,somewhere_aggregate,5,8,-0.10540925533894598,0.8005421074231263
631
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
632
+ aggregate,holistic,bbh,BLZ_240312,kendall,somewhere_aggregate,5,0,0.9486832980505137,0.02297740150320607
633
+ aggregate,holistic,bbh,BLZ_240312,kendall,somewhere_aggregate,5,1,0.31622776601683794,0.44848886103153174
634
+ aggregate,holistic,bbh,BLZ_240312,kendall,somewhere_aggregate,5,2,-0.39999999999999997,0.48333333333333334
635
+ aggregate,holistic,bbh,BLZ_240312,kendall,somewhere_aggregate,5,3,0.9999999999999999,0.016666666666666666
636
+ aggregate,holistic,bbh,BLZ_240312,kendall,somewhere_aggregate,5,4,0.0,1.0
637
+ aggregate,holistic,bbh,BLZ_240312,kendall,somewhere_aggregate,5,5,-0.39999999999999997,0.48333333333333334
638
+ aggregate,holistic,bbh,BLZ_240312,kendall,somewhere_aggregate,5,6,0.7999999999999999,0.08333333333333333
639
+ aggregate,holistic,bbh,BLZ_240312,kendall,somewhere_aggregate,5,7,0.7999999999999999,0.08333333333333333
640
+ aggregate,holistic,bbh,BLZ_240312,kendall,somewhere_aggregate,5,8,0.5270462766947298,0.206507295485425
641
+ aggregate,holistic,bbh,BLZ_240312,kendall,somewhere_aggregate,5,9,0.5270462766947298,0.206507295485425
642
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,somewhere_aggregate,5,0,-0.19999999999999998,0.8166666666666667
643
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,somewhere_aggregate,5,1,0.19999999999999998,0.8166666666666667
644
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,somewhere_aggregate,5,2,0.0,1.0
645
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,somewhere_aggregate,5,3,0.0,1.0
646
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,somewhere_aggregate,5,4,-0.39999999999999997,0.48333333333333334
647
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
648
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
649
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,somewhere_aggregate,5,7,0.7999999999999999,0.08333333333333333
650
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,somewhere_aggregate,5,8,0.19999999999999998,0.8166666666666667
651
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,somewhere_aggregate,5,9,0.7999999999999999,0.08333333333333333
652
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,somewhere_aggregate,5,0,0.7378647873726218,0.07697417298126676
653
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
654
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,somewhere_aggregate,5,2,0.31622776601683794,0.44848886103153174
655
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,somewhere_aggregate,5,3,0.39999999999999997,0.48333333333333334
656
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,somewhere_aggregate,5,4,0.31622776601683794,0.44848886103153174
657
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
658
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,somewhere_aggregate,5,6,-0.39999999999999997,0.48333333333333334
659
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,somewhere_aggregate,5,7,-0.39999999999999997,0.48333333333333334
660
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,somewhere_aggregate,5,8,0.39999999999999997,0.48333333333333334
661
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,somewhere_aggregate,5,9,0.5270462766947298,0.206507295485425
662
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,somewhere_aggregate,5,0,-0.6,0.23333333333333334
663
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,somewhere_aggregate,5,1,-0.6,0.23333333333333334
664
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,somewhere_aggregate,5,2,0.19999999999999998,0.8166666666666667
665
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,somewhere_aggregate,5,3,0.39999999999999997,0.48333333333333334
666
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,somewhere_aggregate,5,4,0.19999999999999998,0.8166666666666667
667
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,somewhere_aggregate,5,5,0.6,0.23333333333333334
668
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,somewhere_aggregate,5,6,-0.39999999999999997,0.48333333333333334
669
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,somewhere_aggregate,5,7,-0.6,0.23333333333333334
670
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,somewhere_aggregate,5,8,-0.39999999999999997,0.48333333333333334
671
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,somewhere_aggregate,5,9,0.6,0.23333333333333334
672
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,somewhere_aggregate,5,0,0.11952286093343936,0.7815112949987133
673
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,somewhere_aggregate,5,1,0.0,1.0
674
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,somewhere_aggregate,5,2,0.0,1.0
675
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,somewhere_aggregate,5,3,0.6,0.23333333333333334
676
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
677
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,somewhere_aggregate,5,5,0.6,0.23333333333333334
678
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,somewhere_aggregate,5,6,0.35856858280031806,0.40538055645894233
679
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,somewhere_aggregate,5,7,0.31622776601683794,0.44848886103153174
680
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,somewhere_aggregate,5,8,0.0,1.0
681
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,somewhere_aggregate,5,9,0.5270462766947298,0.206507295485425
682
+ aggregate,holistic,magi,BLZ_240312,kendall,somewhere_aggregate,5,0,0.39999999999999997,0.48333333333333334
683
+ aggregate,holistic,magi,BLZ_240312,kendall,somewhere_aggregate,5,1,0.19999999999999998,0.8166666666666667
684
+ aggregate,holistic,magi,BLZ_240312,kendall,somewhere_aggregate,5,2,-0.10540925533894598,0.8005421074231263
685
+ aggregate,holistic,magi,BLZ_240312,kendall,somewhere_aggregate,5,3,0.39999999999999997,0.48333333333333334
686
+ aggregate,holistic,magi,BLZ_240312,kendall,somewhere_aggregate,5,4,0.6,0.23333333333333334
687
+ aggregate,holistic,magi,BLZ_240312,kendall,somewhere_aggregate,5,5,0.7999999999999999,0.08333333333333333
688
+ aggregate,holistic,magi,BLZ_240312,kendall,somewhere_aggregate,5,6,0.6,0.23333333333333334
689
+ aggregate,holistic,magi,BLZ_240312,kendall,somewhere_aggregate,5,7,0.7999999999999999,0.08333333333333333
690
+ aggregate,holistic,magi,BLZ_240312,kendall,somewhere_aggregate,5,8,0.19999999999999998,0.8166666666666667
691
+ aggregate,holistic,magi,BLZ_240312,kendall,somewhere_aggregate,5,9,-0.19999999999999998,0.8166666666666667
692
+ aggregate,holistic,mmlu,BLZ_240312,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
693
+ aggregate,holistic,mmlu,BLZ_240312,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
694
+ aggregate,holistic,mmlu,BLZ_240312,kendall,somewhere_aggregate,5,2,0.0,1.0
695
+ aggregate,holistic,mmlu,BLZ_240312,kendall,somewhere_aggregate,5,3,0.39999999999999997,0.48333333333333334
696
+ aggregate,holistic,mmlu,BLZ_240312,kendall,somewhere_aggregate,5,4,0.6,0.23333333333333334
697
+ aggregate,holistic,mmlu,BLZ_240312,kendall,somewhere_aggregate,5,5,0.6,0.23333333333333334
698
+ aggregate,holistic,mmlu,BLZ_240312,kendall,somewhere_aggregate,5,6,0.6,0.23333333333333334
699
+ aggregate,holistic,mmlu,BLZ_240312,kendall,somewhere_aggregate,5,7,0.6,0.23333333333333334
700
+ aggregate,holistic,mmlu,BLZ_240312,kendall,somewhere_aggregate,5,8,0.6,0.23333333333333334
701
+ aggregate,holistic,mmlu,BLZ_240312,kendall,somewhere_aggregate,5,9,0.5270462766947298,0.206507295485425
702
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,somewhere_aggregate,5,0,0.6,0.23333333333333334
703
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,somewhere_aggregate,5,1,0.0,1.0
704
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,somewhere_aggregate,5,2,0.10540925533894598,0.8005421074231263
705
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,somewhere_aggregate,5,3,0.39999999999999997,0.48333333333333334
706
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,somewhere_aggregate,5,4,0.39999999999999997,0.48333333333333334
707
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,somewhere_aggregate,5,5,0.19999999999999998,0.8166666666666667
708
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,somewhere_aggregate,5,6,0.10540925533894598,0.8005421074231263
709
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,somewhere_aggregate,5,7,-0.39999999999999997,0.48333333333333334
710
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,somewhere_aggregate,5,8,0.19999999999999998,0.8166666666666667
711
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,somewhere_aggregate,5,9,0.39999999999999997,0.48333333333333334
712
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,somewhere_aggregate,5,0,0.39999999999999997,0.48333333333333334
713
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,somewhere_aggregate,5,1,0.39999999999999997,0.48333333333333334
714
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,somewhere_aggregate,5,2,0.19999999999999998,0.8166666666666667
715
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,somewhere_aggregate,5,3,0.6,0.23333333333333334
716
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,somewhere_aggregate,5,4,0.6,0.23333333333333334
717
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,somewhere_aggregate,5,5,0.39999999999999997,0.48333333333333334
718
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,somewhere_aggregate,5,6,0.39999999999999997,0.48333333333333334
719
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,somewhere_aggregate,5,7,0.39999999999999997,0.48333333333333334
720
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,somewhere_aggregate,5,8,0.6,0.23333333333333334
721
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,somewhere_aggregate,5,9,0.7999999999999999,0.08333333333333333
cache/agreements_cache_facdc1028ee0edd9aed491afc51b884d.csv ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value
2
+ hellaswag,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666
3
+ humaneval,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.31622776601683794,0.44848886103153174
4
+ mbpp,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.31622776601683794,0.44848886103153174
5
+ winogrande,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
6
+ grounding,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.7378647873726218,0.07697417298126676
7
+ instruction_following,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.7378647873726218,0.07697417298126676
8
+ planning,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
9
+ reasoning,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666
10
+ refinement,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
11
+ safety,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
12
+ theory_of_mind,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
13
+ tool_usage,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
14
+ livebench_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
15
+ reasoning_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
16
+ coding_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
17
+ mathematics_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
18
+ data_analysis_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334
19
+ language_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
20
+ if_average,livebench_240701,aggregate,holistic,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
21
+ arena_hard,arena_hard_2404,aggregate,holistic,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666
22
+ mixeval,mixeval_240601,aggregate,holistic,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
23
+ agieval,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425
24
+ arc_c,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334
25
+ alpacav1,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425
26
+ alpacav2,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334
27
+ alpacaeval2_lc,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.31622776601683794,0.44848886103153174
28
+ arena_elo,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666
29
+ bbh,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425
30
+ eq_benchv2,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666
31
+ gpt4all,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.0,1.0
32
+ hugging_6,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
33
+ llmonitor,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334
34
+ magi,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.31622776601683794,0.44848886103153174
35
+ mmlu,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
36
+ mt_bench,BLZ_240312,aggregate,holistic,kendall,top_aggregate,5,0,0.6,0.23333333333333334
37
+ biggen_mwr,biggen_240612,aggregate,holistic,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
38
+ aggregate,holistic,hellaswag,BLZ_240312,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666
39
+ aggregate,holistic,humaneval,BLZ_240312,kendall,top_aggregate,5,0,0.31622776601683794,0.44848886103153174
40
+ aggregate,holistic,mbpp,BLZ_240312,kendall,top_aggregate,5,0,0.31622776601683794,0.44848886103153174
41
+ aggregate,holistic,winogrande,BLZ_240312,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
42
+ aggregate,holistic,grounding,biggen_240612,kendall,top_aggregate,5,0,0.7378647873726218,0.07697417298126676
43
+ aggregate,holistic,instruction_following,biggen_240612,kendall,top_aggregate,5,0,0.7378647873726218,0.07697417298126676
44
+ aggregate,holistic,planning,biggen_240612,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
45
+ aggregate,holistic,reasoning,biggen_240612,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666
46
+ aggregate,holistic,refinement,biggen_240612,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
47
+ aggregate,holistic,safety,biggen_240612,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
48
+ aggregate,holistic,theory_of_mind,biggen_240612,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
49
+ aggregate,holistic,tool_usage,biggen_240612,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
50
+ aggregate,holistic,livebench_average,livebench_240701,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
51
+ aggregate,holistic,reasoning_average,livebench_240701,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
52
+ aggregate,holistic,coding_average,livebench_240701,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
53
+ aggregate,holistic,mathematics_average,livebench_240701,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
54
+ aggregate,holistic,data_analysis_average,livebench_240701,kendall,top_aggregate,5,0,0.6,0.23333333333333334
55
+ aggregate,holistic,language_average,livebench_240701,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
56
+ aggregate,holistic,if_average,livebench_240701,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
57
+ aggregate,holistic,arena_hard,arena_hard_2404,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666
58
+ aggregate,holistic,mixeval,mixeval_240601,kendall,top_aggregate,5,0,0.7999999999999999,0.08333333333333333
59
+ aggregate,holistic,agieval,BLZ_240312,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425
60
+ aggregate,holistic,arc_c,BLZ_240312,kendall,top_aggregate,5,0,0.6,0.23333333333333334
61
+ aggregate,holistic,alpacav1,BLZ_240312,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425
62
+ aggregate,holistic,alpacav2,BLZ_240312,kendall,top_aggregate,5,0,0.6,0.23333333333333334
63
+ aggregate,holistic,alpacaeval2_lc,BLZ_240312,kendall,top_aggregate,5,0,0.31622776601683794,0.44848886103153174
64
+ aggregate,holistic,arena_elo,BLZ_240312,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666
65
+ aggregate,holistic,bbh,BLZ_240312,kendall,top_aggregate,5,0,0.5270462766947298,0.206507295485425
66
+ aggregate,holistic,eq_benchv2,BLZ_240312,kendall,top_aggregate,5,0,0.9999999999999999,0.016666666666666666
67
+ aggregate,holistic,gpt4all,BLZ_240312,kendall,top_aggregate,5,0,0.0,1.0
68
+ aggregate,holistic,hugging_6,BLZ_240312,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
69
+ aggregate,holistic,llmonitor,BLZ_240312,kendall,top_aggregate,5,0,0.6,0.23333333333333334
70
+ aggregate,holistic,magi,BLZ_240312,kendall,top_aggregate,5,0,0.31622776601683794,0.44848886103153174
71
+ aggregate,holistic,mmlu,BLZ_240312,kendall,top_aggregate,5,0,0.19999999999999998,0.8166666666666667
72
+ aggregate,holistic,mt_bench,BLZ_240312,kendall,top_aggregate,5,0,0.6,0.23333333333333334
73
+ aggregate,holistic,biggen_mwr,biggen_240612,kendall,top_aggregate,5,0,0.39999999999999997,0.48333333333333334
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ git+https://github.com/IBM/benchbench.git
2
+ streamlit
3
+ plotly