scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,8,0,-0.14285714285714285,0.7195436507936508 Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,8,1,-0.14285714285714285,0.7195436507936508 Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,8,2,-0.2857142857142857,0.39875992063492066 Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,8,3,-0.21428571428571427,0.5484126984126985 Holmes,holmes_240829.csv,aggregate,aggregate,kendall,random,8,4,-0.07142857142857142,0.9048611111111111 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6182840223353117,0.0340492747686748 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111 Helm Lite,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9285714285714285,0.0003968253968253968 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,-0.07142857142857142,0.9048611111111111 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,-0.3571428571428571,0.27509920634920637 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,-0.07142857142857142,0.9048611111111111 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.21428571428571427,0.5484126984126985 Helm Lite NarrativeQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,-0.3571428571428571,0.27509920634920637 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.2857142857142857,0.39875992063492066 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.0,1.0 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.14285714285714285,0.7195436507936508 Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.2857142857142857,0.39875992063492066 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.42857142857142855,0.17886904761904762 Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.5714285714285714,0.06101190476190476 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 Helm Lite OpenBookQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762 Helm Lite MMLU,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476 Helm Lite MathEquivalentCOT,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.42857142857142855,0.17886904761904762 Helm Lite GSM8K,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.5714285714285714,0.06101190476190476 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5455447255899809,0.0614649096074132 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7637626158259734,0.008839740160738534 Helm Lite LegalBench,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7142857142857142,0.014136904761904762 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5455447255899809,0.0614649096074132 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762 Helm Lite MedQA,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 Helm Lite WMT2014,helm_lite_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9092412093166348,0.0018276750354536814 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381 HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6428571428571428,0.03115079365079365 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7637626158259734,0.008839740160738534 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3401680257083045,0.25175949861106117 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381 HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.0,1.0 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.14285714285714285,0.7195436507936508 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,3,0.42857142857142855,0.17886904761904762 HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,4,0.4999999999999999,0.10868055555555556 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111 HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.21428571428571427,0.5484126984126985 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5455447255899809,0.0614649096074132 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,3,0.18184824186332696,0.5330356744917513 HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,4,0.2545875386086578,0.38281014365989596 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.10910894511799618,0.7083840532183997 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.21428571428571427,0.5484126984126985 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,3,0.21428571428571427,0.5484126984126985 HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,4,0.5714285714285714,0.06101190476190476 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 OpenCompass Academic,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111 OpenCompass MMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111 OpenCompass MMLU Pro,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7142857142857142,0.014136904761904762 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381 OpenCompass CMMLU,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7142857142857142,0.014136904761904762 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111 OpenCompass BBH,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6910233190806425,0.017844011512848347 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9999999999999998,4.96031746031746e-05 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.42857142857142855,0.17886904761904762 OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.40006613209931935,0.17023995462900499 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5929994533288809,0.04437842734548688 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6910233190806425,0.017844011512848347 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5455447255899809,0.0614649096074132 OpenCompass HumanEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6910233190806425,0.017844011512848347 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476 OpenCompass IFEval,opencompass_academic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.5714285714285714,0.06101190476190476 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476 Helm MMLU,helm_mmlu_240829.csv,aggregate,aggregate,kendall,random,8,4,0.42857142857142855,0.17886904761904762 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111 LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111 Helm Classic,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9285714285714285,0.0003968253968253968 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 Helm BoolQ,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.4999999999999999,0.10868055555555556 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476 Helm NarrativeQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.4999999999999999,0.10868055555555556 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762 Helm NaturalQuestionsClosed,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7142857142857142,0.014136904761904762 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6910233190806425,0.017844011512848347 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111 Helm NaturalQuestionsOpen,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7637626158259734,0.008839740160738534 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.21428571428571427,0.5484126984126985 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 Helm QuAC,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.5714285714285714,0.06101190476190476 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.07142857142857142,0.9048611111111111 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762 Helm OpenBookQA,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6428571428571428,0.03115079365079365 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.21428571428571427,0.5484126984126985 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3706246583305506,0.20891238174069848 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.3571428571428571,0.27509920634920637 Helm IMDB,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.40006613209931935,0.17023995462900499 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.6428571428571428,0.03115079365079365 Helm CivilComments,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5455447255899809,0.0614649096074132 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,3,0.6428571428571428,0.03115079365079365 Helm RAFT,helm_classic_240829.csv,aggregate,aggregate,kendall,random,8,4,0.5714285714285714,0.06101190476190476 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476 MMLU Pro,mmlu_pro_240829.csv,aggregate,aggregate,kendall,random,8,4,0.5714285714285714,0.06101190476190476 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.6428571428571428,0.03115079365079365 MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.6428571428571428,0.03115079365079365 MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.4999999999999999,0.10868055555555556 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.4999999999999999,0.10868055555555556 MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9999999999999998,4.96031746031746e-05 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.6428571428571428,0.03115079365079365 MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.4999999999999999,0.10868055555555556 MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.42857142857142855,0.17886904761904762 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6910233190806425,0.017844011512848347 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381 MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.5714285714285714,0.06101190476190476 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.6182840223353117,0.0340492747686748 MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9285714285714285,0.0003968253968253968 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3571428571428571,0.27509920634920637 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762 MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.3571428571428571,0.27509920634920637 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.22237479499833035,0.45088703102517036 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381 MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.2545875386086578,0.38281014365989596 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5455447255899809,0.0614649096074132 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.2857142857142857,0.39875992063492066 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7637626158259734,0.008839740160738534 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476 MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.14285714285714285,0.7195436507936508 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9092412093166348,0.0018276750354536814 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7637626158259734,0.008839740160738534 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476 AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9285714285714285,0.0003968253968253968 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381 OpenCompass,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.0,1.0 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,3,0.42857142857142855,0.17886904761904762 OpenCompass Language,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,4,0.21428571428571427,0.5484126984126985 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,3,0.2857142857142857,0.39875992063492066 OpenCompass Knowledge,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762 OpenCompass Reasoning,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,4,0.42857142857142855,0.17886904761904762 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762 OpenCompass Math,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.47280542884465016,0.10506382347888965 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762 OpenCompass Code,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762 OpenCompass Instruction,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,3,0.21428571428571427,0.5484126984126985 OpenCompass Agent,opencompass_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7142857142857142,0.014136904761904762 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3571428571428571,0.27509920634920637 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,1,0.14285714285714285,0.7195436507936508 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3571428571428571,0.27509920634920637 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,3,0.4999999999999999,0.10868055555555556 OpenCompass Arena,opencompass_arena_240829.csv,aggregate,aggregate,kendall,random,8,4,0.2857142857142857,0.39875992063492066 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381 LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9285714285714285,0.0003968253968253968 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6910233190806425,0.017844011512848347 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.836501912571304,0.004136737098676645 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5455447255899809,0.0614649096074132 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381 LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9285714285714285,0.0003968253968253968 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381 LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476 LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9819805060619657,0.0007619896395304237 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762 LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7142857142857142,0.014136904761904762 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381 LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6428571428571428,0.03115079365079365 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.6428571428571428,0.03115079365079365 LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7142857142857142,0.014136904761904762 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9285714285714285,0.0003968253968253968 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6428571428571428,0.03115079365079365 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.4999999999999999,0.10868055555555556 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9999999999999998,4.96031746031746e-05 WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111 WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9999999999999998,4.96031746031746e-05 WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9285714285714285,0.0003968253968253968 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9999999999999998,4.96031746031746e-05 WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111 Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,3,0.3571428571428571,0.27509920634920637 HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,4,0.42857142857142855,0.17886904761904762 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.5455447255899809,0.0614649096074132 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,3,0.2857142857142857,0.39875992063492066 HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,4,0.4999999999999999,0.10868055555555556 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,3,0.42857142857142855,0.17886904761904762 HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,4,0.21428571428571427,0.5484126984126985 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,3,0.14285714285714285,0.7195436507936508 HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,4,0.4999999999999999,0.10868055555555556 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,3,0.21428571428571427,0.5484126984126985 HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,4,0.5714285714285714,0.06101190476190476 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.2857142857142857,0.39875992063492066 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,3,-0.07142857142857142,0.9048611111111111 HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,4,0.21428571428571427,0.5484126984126985 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,3,0.14285714285714285,0.7195436507936508 HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,aggregate,aggregate,kendall,random,8,4,0.47280542884465016,0.10506382347888965 BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762 BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,3,0.2857142857142857,0.39875992063492066 BFCL,bfcl_240906.csv,aggregate,aggregate,kendall,random,8,4,0.4999999999999999,0.10868055555555556 BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556 BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,0.21428571428571427,0.5484126984126985 BIGGEN,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,-0.3571428571428571,0.27509920634920637 BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.47280542884465016,0.10506382347888965 BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,0.3571428571428571,0.27509920634920637 BIGGEN Grounding,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,0.18184824186332696,0.5330356744917513 BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,0.10910894511799618,0.7083840532183997 BIGGEN Instruction Following,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,-0.40006613209931935,0.17023995462900499 BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.47280542884465016,0.10506382347888965 BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6182840223353117,0.0340492747686748 BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5455447255899809,0.0614649096074132 BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,0.21428571428571427,0.5484126984126985 BIGGEN Planning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,-0.42857142857142855,0.17886904761904762 BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,0.2857142857142857,0.39875992063492066 BIGGEN Reasoning,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,0.0,1.0 BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,0.21428571428571427,0.5484126984126985 BIGGEN Refinement,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,-0.07142857142857142,0.9048611111111111 BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.2857142857142857,0.39875992063492066 BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.42857142857142855,0.17886904761904762 BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.14285714285714285,0.7195436507936508 BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,-0.14285714285714285,0.7195436507936508 BIGGEN Safety,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,-0.42857142857142855,0.17886904761904762 BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762 BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.42857142857142855,0.17886904761904762 BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,0.21428571428571427,0.5484126984126985 BIGGEN Theory of Mind,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,-0.2545875386086578,0.38281014365989596 BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,0.18184824186332696,0.5330356744917513 BIGGEN Tool Usage,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,-0.21428571428571427,0.5484126984126985 BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556 BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7637626158259734,0.008839740160738534 BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,3,0.6910233190806425,0.017844011512848347 BIGGEN Multilingual,biggen_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6428571428571428,0.03115079365079365 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381 LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.6910233190806425,0.017844011512848347 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,3,0.836501912571304,0.004136737098676645 LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,4,0.9285714285714285,0.0003968253968253968 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,3,0.7637626158259734,0.008839740160738534 LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,4,0.7142857142857142,0.014136904761904762 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381 LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,3,0.6182840223353117,0.0340492747686748 LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,3,0.6428571428571428,0.03115079365079365 LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,3,0.6428571428571428,0.03115079365079365 LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,4,0.7142857142857142,0.014136904761904762 aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,8,0,-0.14285714285714285,0.7195436507936508 aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,8,1,-0.14285714285714285,0.7195436507936508 aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,8,2,-0.2857142857142857,0.39875992063492066 aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,8,3,-0.21428571428571427,0.5484126984126985 aggregate,aggregate,Holmes,holmes_240829.csv,kendall,random,8,4,-0.07142857142857142,0.9048611111111111 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,1,0.6182840223353117,0.0340492747686748 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111 aggregate,aggregate,Helm Lite,helm_lite_240829.csv,kendall,random,8,4,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,0,-0.07142857142857142,0.9048611111111111 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,1,-0.3571428571428571,0.27509920634920637 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,2,-0.07142857142857142,0.9048611111111111 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,3,0.21428571428571427,0.5484126984126985 aggregate,aggregate,Helm Lite NarrativeQA,helm_lite_240829.csv,kendall,random,8,4,-0.3571428571428571,0.27509920634920637 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,0,0.2857142857142857,0.39875992063492066 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,1,0.0,1.0 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,3,0.14285714285714285,0.7195436507936508 aggregate,aggregate,Helm Lite NaturalQuestionsOpen,helm_lite_240829.csv,kendall,random,8,4,0.2857142857142857,0.39875992063492066 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,3,0.42857142857142855,0.17886904761904762 aggregate,aggregate,Helm Lite NaturalQuestionsClosed,helm_lite_240829.csv,kendall,random,8,4,0.5714285714285714,0.06101190476190476 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,Helm Lite OpenBookQA,helm_lite_240829.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762 aggregate,aggregate,Helm Lite MMLU,helm_lite_240829.csv,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476 aggregate,aggregate,Helm Lite MathEquivalentCOT,helm_lite_240829.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,8,3,0.42857142857142855,0.17886904761904762 aggregate,aggregate,Helm Lite GSM8K,helm_lite_240829.csv,kendall,random,8,4,0.5714285714285714,0.06101190476190476 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,2,0.5455447255899809,0.0614649096074132 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,3,0.7637626158259734,0.008839740160738534 aggregate,aggregate,Helm Lite LegalBench,helm_lite_240829.csv,kendall,random,8,4,0.7142857142857142,0.014136904761904762 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,1,0.5455447255899809,0.0614649096074132 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762 aggregate,aggregate,Helm Lite MedQA,helm_lite_240829.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,Helm Lite WMT2014,helm_lite_240829.csv,kendall,random,8,4,0.9092412093166348,0.0018276750354536814 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381 aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,4,0.6428571428571428,0.03115079365079365 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,4,0.7637626158259734,0.008839740160738534 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.3401680257083045,0.25175949861106117 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381 aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.0,1.0 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.14285714285714285,0.7195436507936508 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,3,0.42857142857142855,0.17886904761904762 aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,4,0.4999999999999999,0.10868055555555556 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111 aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.21428571428571427,0.5484126984126985 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.5455447255899809,0.0614649096074132 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,3,0.18184824186332696,0.5330356744917513 aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,4,0.2545875386086578,0.38281014365989596 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.10910894511799618,0.7083840532183997 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.21428571428571427,0.5484126984126985 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,3,0.21428571428571427,0.5484126984126985 aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,4,0.5714285714285714,0.06101190476190476 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,OpenCompass Academic,opencompass_academic_240829.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111 aggregate,aggregate,OpenCompass MMLU,opencompass_academic_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111 aggregate,aggregate,OpenCompass MMLU Pro,opencompass_academic_240829.csv,kendall,random,8,4,0.7142857142857142,0.014136904761904762 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381 aggregate,aggregate,OpenCompass CMMLU,opencompass_academic_240829.csv,kendall,random,8,4,0.7142857142857142,0.014136904761904762 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111 aggregate,aggregate,OpenCompass BBH,opencompass_academic_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,8,1,0.6910233190806425,0.017844011512848347 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,8,2,0.9999999999999998,4.96031746031746e-05 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,8,3,0.42857142857142855,0.17886904761904762 aggregate,aggregate,OpenCompass GQPA-Dimand,opencompass_academic_240829.csv,kendall,random,8,4,0.40006613209931935,0.17023995462900499 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,8,1,0.5929994533288809,0.04437842734548688 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,8,2,0.6910233190806425,0.017844011512848347 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,8,3,0.5455447255899809,0.0614649096074132 aggregate,aggregate,OpenCompass HumanEval,opencompass_academic_240829.csv,kendall,random,8,4,0.6910233190806425,0.017844011512848347 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476 aggregate,aggregate,OpenCompass IFEval,opencompass_academic_240829.csv,kendall,random,8,4,0.5714285714285714,0.06101190476190476 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476 aggregate,aggregate,Helm MMLU,helm_mmlu_240829.csv,kendall,random,8,4,0.42857142857142855,0.17886904761904762 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111 aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111 aggregate,aggregate,Helm Classic,helm_classic_240829.csv,kendall,random,8,4,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,Helm BoolQ,helm_classic_240829.csv,kendall,random,8,4,0.4999999999999999,0.10868055555555556 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476 aggregate,aggregate,Helm NarrativeQA,helm_classic_240829.csv,kendall,random,8,4,0.4999999999999999,0.10868055555555556 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762 aggregate,aggregate,Helm NaturalQuestionsClosed,helm_classic_240829.csv,kendall,random,8,4,0.7142857142857142,0.014136904761904762 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,8,0,0.6910233190806425,0.017844011512848347 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111 aggregate,aggregate,Helm NaturalQuestionsOpen,helm_classic_240829.csv,kendall,random,8,4,0.7637626158259734,0.008839740160738534 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,8,0,0.21428571428571427,0.5484126984126985 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,Helm QuAC,helm_classic_240829.csv,kendall,random,8,4,0.5714285714285714,0.06101190476190476 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,8,2,0.07142857142857142,0.9048611111111111 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762 aggregate,aggregate,Helm OpenBookQA,helm_classic_240829.csv,kendall,random,8,4,0.6428571428571428,0.03115079365079365 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,8,0,0.21428571428571427,0.5484126984126985 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,8,2,0.3706246583305506,0.20891238174069848 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,8,3,0.3571428571428571,0.27509920634920637 aggregate,aggregate,Helm IMDB,helm_classic_240829.csv,kendall,random,8,4,0.40006613209931935,0.17023995462900499 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,8,3,0.6428571428571428,0.03115079365079365 aggregate,aggregate,Helm CivilComments,helm_classic_240829.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,8,1,0.5455447255899809,0.0614649096074132 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,8,3,0.6428571428571428,0.03115079365079365 aggregate,aggregate,Helm RAFT,helm_classic_240829.csv,kendall,random,8,4,0.5714285714285714,0.06101190476190476 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476 aggregate,aggregate,MMLU Pro,mmlu_pro_240829.csv,kendall,random,8,4,0.5714285714285714,0.06101190476190476 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,3,0.6428571428571428,0.03115079365079365 aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,3,0.6428571428571428,0.03115079365079365 aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,4,0.4999999999999999,0.10868055555555556 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,3,0.4999999999999999,0.10868055555555556 aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,2,0.9999999999999998,4.96031746031746e-05 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,3,0.6428571428571428,0.03115079365079365 aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,3,0.4999999999999999,0.10868055555555556 aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,4,0.42857142857142855,0.17886904761904762 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,2,0.6910233190806425,0.017844011512848347 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381 aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,4,0.5714285714285714,0.06101190476190476 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,3,0.6182840223353117,0.0340492747686748 aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,4,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,1,0.3571428571428571,0.27509920634920637 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762 aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,4,0.3571428571428571,0.27509920634920637 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,1,0.22237479499833035,0.45088703102517036 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381 aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,4,0.2545875386086578,0.38281014365989596 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,0,0.5455447255899809,0.0614649096074132 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,1,0.2857142857142857,0.39875992063492066 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,2,0.7637626158259734,0.008839740160738534 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476 aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,4,0.14285714285714285,0.7195436507936508 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,0,0.9092412093166348,0.0018276750354536814 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,1,0.7637626158259734,0.008839740160738534 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476 aggregate,aggregate,AlphacaEval v2lc,alphacaeval_v2lc_240829.csv,kendall,random,8,4,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381 aggregate,aggregate,OpenCompass,opencompass_240829.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,8,1,0.0,1.0 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,8,3,0.42857142857142855,0.17886904761904762 aggregate,aggregate,OpenCompass Language,opencompass_240829.csv,kendall,random,8,4,0.21428571428571427,0.5484126984126985 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,8,3,0.2857142857142857,0.39875992063492066 aggregate,aggregate,OpenCompass Knowledge,opencompass_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762 aggregate,aggregate,OpenCompass Reasoning,opencompass_240829.csv,kendall,random,8,4,0.42857142857142855,0.17886904761904762 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762 aggregate,aggregate,OpenCompass Math,opencompass_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,8,2,0.47280542884465016,0.10506382347888965 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762 aggregate,aggregate,OpenCompass Code,opencompass_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762 aggregate,aggregate,OpenCompass Instruction,opencompass_240829.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,8,3,0.21428571428571427,0.5484126984126985 aggregate,aggregate,OpenCompass Agent,opencompass_240829.csv,kendall,random,8,4,0.7142857142857142,0.014136904761904762 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,0,0.3571428571428571,0.27509920634920637 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,1,0.14285714285714285,0.7195436507936508 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,2,0.3571428571428571,0.27509920634920637 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,3,0.4999999999999999,0.10868055555555556 aggregate,aggregate,OpenCompass Arena,opencompass_arena_240829.csv,kendall,random,8,4,0.2857142857142857,0.39875992063492066 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381 aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,4,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,0,0.6910233190806425,0.017844011512848347 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,1,0.836501912571304,0.004136737098676645 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,2,0.5455447255899809,0.0614649096074132 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381 aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,4,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381 aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476 aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,0,0.9819805060619657,0.0007619896395304237 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762 aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,4,0.7142857142857142,0.014136904761904762 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381 aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,4,0.6428571428571428,0.03115079365079365 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,3,0.6428571428571428,0.03115079365079365 aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,4,0.7142857142857142,0.014136904761904762 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,4,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,4,0.6428571428571428,0.03115079365079365 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,4,0.4999999999999999,0.10868055555555556 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,3,0.9999999999999998,4.96031746031746e-05 aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111 aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,3,0.9999999999999998,4.96031746031746e-05 aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,4,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,3,0.9999999999999998,4.96031746031746e-05 aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111 aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,3,0.3571428571428571,0.27509920634920637 aggregate,aggregate,HF OpenLLM v1,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,4,0.42857142857142855,0.17886904761904762 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.5455447255899809,0.0614649096074132 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,3,0.2857142857142857,0.39875992063492066 aggregate,aggregate,HFv1 ARC,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,4,0.4999999999999999,0.10868055555555556 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,3,0.42857142857142855,0.17886904761904762 aggregate,aggregate,HFv1 GSM8K,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,4,0.21428571428571427,0.5484126984126985 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,3,0.14285714285714285,0.7195436507936508 aggregate,aggregate,HFv1 HellaSwag,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,4,0.4999999999999999,0.10868055555555556 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,3,0.21428571428571427,0.5484126984126985 aggregate,aggregate,HFv1 MMLU,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,4,0.5714285714285714,0.06101190476190476 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.2857142857142857,0.39875992063492066 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,3,-0.07142857142857142,0.9048611111111111 aggregate,aggregate,HFv1 TruthfulQA,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,4,0.21428571428571427,0.5484126984126985 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,3,0.14285714285714285,0.7195436507936508 aggregate,aggregate,HFv1 Winogrande,hf_open_llm_v1_240829_frozen.csv,kendall,random,8,4,0.47280542884465016,0.10506382347888965 aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762 aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,3,0.2857142857142857,0.39875992063492066 aggregate,aggregate,BFCL,bfcl_240906.csv,kendall,random,8,4,0.4999999999999999,0.10868055555555556 aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556 aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,3,0.21428571428571427,0.5484126984126985 aggregate,aggregate,BIGGEN,biggen_240829.csv,kendall,random,8,4,-0.3571428571428571,0.27509920634920637 aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,2,0.47280542884465016,0.10506382347888965 aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,3,0.3571428571428571,0.27509920634920637 aggregate,aggregate,BIGGEN Grounding,biggen_240829.csv,kendall,random,8,4,0.18184824186332696,0.5330356744917513 aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,3,0.10910894511799618,0.7083840532183997 aggregate,aggregate,BIGGEN Instruction Following,biggen_240829.csv,kendall,random,8,4,-0.40006613209931935,0.17023995462900499 aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,0,0.47280542884465016,0.10506382347888965 aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,1,0.6182840223353117,0.0340492747686748 aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,2,0.5455447255899809,0.0614649096074132 aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,3,0.21428571428571427,0.5484126984126985 aggregate,aggregate,BIGGEN Planning,biggen_240829.csv,kendall,random,8,4,-0.42857142857142855,0.17886904761904762 aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,3,0.2857142857142857,0.39875992063492066 aggregate,aggregate,BIGGEN Reasoning,biggen_240829.csv,kendall,random,8,4,0.0,1.0 aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,3,0.21428571428571427,0.5484126984126985 aggregate,aggregate,BIGGEN Refinement,biggen_240829.csv,kendall,random,8,4,-0.07142857142857142,0.9048611111111111 aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,0,0.2857142857142857,0.39875992063492066 aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,1,0.42857142857142855,0.17886904761904762 aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,2,0.14285714285714285,0.7195436507936508 aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,3,-0.14285714285714285,0.7195436507936508 aggregate,aggregate,BIGGEN Safety,biggen_240829.csv,kendall,random,8,4,-0.42857142857142855,0.17886904761904762 aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762 aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,2,0.42857142857142855,0.17886904761904762 aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,3,0.21428571428571427,0.5484126984126985 aggregate,aggregate,BIGGEN Theory of Mind,biggen_240829.csv,kendall,random,8,4,-0.2545875386086578,0.38281014365989596 aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,3,0.18184824186332696,0.5330356744917513 aggregate,aggregate,BIGGEN Tool Usage,biggen_240829.csv,kendall,random,8,4,-0.21428571428571427,0.5484126984126985 aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556 aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,2,0.7637626158259734,0.008839740160738534 aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,3,0.6910233190806425,0.017844011512848347 aggregate,aggregate,BIGGEN Multilingual,biggen_240829.csv,kendall,random,8,4,0.6428571428571428,0.03115079365079365 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381 aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,2,0.6910233190806425,0.017844011512848347 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,3,0.836501912571304,0.004136737098676645 aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,4,0.9285714285714285,0.0003968253968253968 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,3,0.7637626158259734,0.008839740160738534 aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,4,0.7142857142857142,0.014136904761904762 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381 aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,0,0.9999999999999998,4.96031746031746e-05 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,3,0.6182840223353117,0.0340492747686748 aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,3,0.6428571428571428,0.03115079365079365 aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,3,0.6428571428571428,0.03115079365079365 aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,4,0.7142857142857142,0.014136904761904762