Spaces:
Sleeping
Sleeping
| scenario,scenario_source,ref_scenario,ref_source,corr_type,model_select_strategy,model_subset_size_requested,exp_n,correlation,p_value | |
| HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 | |
| HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111 | |
| HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111 | |
| HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,5,0.8571428571428571,0.001736111111111111 | |
| HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,6,0.8571428571428571,0.001736111111111111 | |
| HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,7,0.8571428571428571,0.001736111111111111 | |
| HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,8,0.8571428571428571,0.001736111111111111 | |
| HF OpenLLM v2,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,9,0.8571428571428571,0.001736111111111111 | |
| HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 | |
| HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111 | |
| HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111 | |
| HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,5,0.8571428571428571,0.001736111111111111 | |
| HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,6,0.8571428571428571,0.001736111111111111 | |
| HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,7,0.8571428571428571,0.001736111111111111 | |
| HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,8,0.8571428571428571,0.001736111111111111 | |
| HFv2 BBH,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,9,0.8571428571428571,0.001736111111111111 | |
| HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.4999999999999999,0.10868055555555556 | |
| HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.4999999999999999,0.10868055555555556 | |
| HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,3,0.4999999999999999,0.10868055555555556 | |
| HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,4,0.4999999999999999,0.10868055555555556 | |
| HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,5,0.4999999999999999,0.10868055555555556 | |
| HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,6,0.4999999999999999,0.10868055555555556 | |
| HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,7,0.4999999999999999,0.10868055555555556 | |
| HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,8,0.4999999999999999,0.10868055555555556 | |
| HFv2 GPQA,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,9,0.4999999999999999,0.10868055555555556 | |
| HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 | |
| HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381 | |
| HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381 | |
| HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,5,0.7857142857142856,0.005505952380952381 | |
| HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,6,0.7857142857142856,0.005505952380952381 | |
| HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,7,0.7857142857142856,0.005505952380952381 | |
| HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,8,0.7857142857142856,0.005505952380952381 | |
| HFv2 IFEval,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,9,0.7857142857142856,0.005505952380952381 | |
| HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 | |
| HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111 | |
| HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111 | |
| HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,5,0.8571428571428571,0.001736111111111111 | |
| HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,6,0.8571428571428571,0.001736111111111111 | |
| HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,7,0.8571428571428571,0.001736111111111111 | |
| HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,8,0.8571428571428571,0.001736111111111111 | |
| HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,9,0.8571428571428571,0.001736111111111111 | |
| HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.3706246583305506,0.20891238174069848 | |
| HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.3706246583305506,0.20891238174069848 | |
| HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.3706246583305506,0.20891238174069848 | |
| HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,3,0.3706246583305506,0.20891238174069848 | |
| HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,4,0.3706246583305506,0.20891238174069848 | |
| HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,5,0.3706246583305506,0.20891238174069848 | |
| HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,6,0.3706246583305506,0.20891238174069848 | |
| HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,7,0.3706246583305506,0.20891238174069848 | |
| HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,8,0.3706246583305506,0.20891238174069848 | |
| HFv2 Math Level 5,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,9,0.3706246583305506,0.20891238174069848 | |
| HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 | |
| HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476 | |
| HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,4,0.5714285714285714,0.06101190476190476 | |
| HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,5,0.5714285714285714,0.06101190476190476 | |
| HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,6,0.5714285714285714,0.06101190476190476 | |
| HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,7,0.5714285714285714,0.06101190476190476 | |
| HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,8,0.5714285714285714,0.06101190476190476 | |
| HFv2 MuSR,hf_open_llm_v2_240829.csv,aggregate,aggregate,kendall,random,8,9,0.5714285714285714,0.06101190476190476 | |
| LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 | |
| LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 | |
| LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 | |
| LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111 | |
| LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111 | |
| LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,8,5,0.7857142857142856,0.005505952380952381 | |
| LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,8,6,0.8571428571428571,0.001736111111111111 | |
| LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,8,7,0.8571428571428571,0.001736111111111111 | |
| LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,8,8,0.8571428571428571,0.001736111111111111 | |
| LMSys Arena,chatbot_arena_240829.csv,aggregate,aggregate,kendall,random,8,9,0.8571428571428571,0.001736111111111111 | |
| MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 | |
| MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.6428571428571428,0.03115079365079365 | |
| MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6428571428571428,0.03115079365079365 | |
| MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,5,0.7857142857142856,0.005505952380952381 | |
| MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,6,0.7857142857142856,0.005505952380952381 | |
| MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,7,0.6428571428571428,0.03115079365079365 | |
| MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,8,0.7142857142857142,0.014136904761904762 | |
| MixEval,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,9,0.7857142857142856,0.005505952380952381 | |
| MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 | |
| MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 | |
| MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 | |
| MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 | |
| MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9285714285714285,0.0003968253968253968 | |
| MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,5,0.9285714285714285,0.0003968253968253968 | |
| MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,6,0.9285714285714285,0.0003968253968253968 | |
| MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,7,0.9285714285714285,0.0003968253968253968 | |
| MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,8,0.9999999999999998,4.96031746031746e-05 | |
| MixEval Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,9,0.9285714285714285,0.0003968253968253968 | |
| MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.6428571428571428,0.03115079365079365 | |
| MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6428571428571428,0.03115079365079365 | |
| MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,5,0.7142857142857142,0.014136904761904762 | |
| MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,6,0.6428571428571428,0.03115079365079365 | |
| MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,7,0.6428571428571428,0.03115079365079365 | |
| MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,8,0.7142857142857142,0.014136904761904762 | |
| MixEval TriviaQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,9,0.6428571428571428,0.03115079365079365 | |
| MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 | |
| MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762 | |
| MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7142857142857142,0.014136904761904762 | |
| MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,5,0.8571428571428571,0.001736111111111111 | |
| MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,6,0.8571428571428571,0.001736111111111111 | |
| MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,7,0.7142857142857142,0.014136904761904762 | |
| MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,8,0.7142857142857142,0.014136904761904762 | |
| MixEval MMLU,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,9,0.8571428571428571,0.001736111111111111 | |
| MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.6428571428571428,0.03115079365079365 | |
| MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6428571428571428,0.03115079365079365 | |
| MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,5,0.7857142857142856,0.005505952380952381 | |
| MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,6,0.6428571428571428,0.03115079365079365 | |
| MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,7,0.6428571428571428,0.03115079365079365 | |
| MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,8,0.7142857142857142,0.014136904761904762 | |
| MixEval DROP,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,9,0.7142857142857142,0.014136904761904762 | |
| MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.42857142857142855,0.17886904761904762 | |
| MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.42857142857142855,0.17886904761904762 | |
| MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,5,0.5714285714285714,0.06101190476190476 | |
| MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,6,0.5714285714285714,0.06101190476190476 | |
| MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,7,0.42857142857142855,0.17886904761904762 | |
| MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,8,0.5714285714285714,0.06101190476190476 | |
| MixEval HellaSwag,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,9,0.6428571428571428,0.03115079365079365 | |
| MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.836501912571304,0.004136737098676645 | |
| MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6910233190806425,0.017844011512848347 | |
| MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6910233190806425,0.017844011512848347 | |
| MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5929994533288809,0.04437842734548688 | |
| MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.5929994533288809,0.04437842734548688 | |
| MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,5,0.6671243849949912,0.02370907646413876 | |
| MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,6,0.6910233190806425,0.017844011512848347 | |
| MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,7,0.5929994533288809,0.04437842734548688 | |
| MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,8,0.6671243849949912,0.02370907646413876 | |
| MixEval CommonsenseQA,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,9,0.6910233190806425,0.017844011512848347 | |
| MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762 | |
| MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7142857142857142,0.014136904761904762 | |
| MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,5,0.7857142857142856,0.005505952380952381 | |
| MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,6,0.7142857142857142,0.014136904761904762 | |
| MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,7,0.7142857142857142,0.014136904761904762 | |
| MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,8,0.8571428571428571,0.001736111111111111 | |
| MixEval TriviaQA Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,9,0.7142857142857142,0.014136904761904762 | |
| MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381 | |
| MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381 | |
| MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,5,0.8571428571428571,0.001736111111111111 | |
| MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,6,0.7857142857142856,0.005505952380952381 | |
| MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,7,0.7857142857142856,0.005505952380952381 | |
| MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,8,0.7857142857142856,0.005505952380952381 | |
| MixEval MMLU Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,9,0.7142857142857142,0.014136904761904762 | |
| MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 | |
| MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476 | |
| MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,4,0.5714285714285714,0.06101190476190476 | |
| MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,5,0.5714285714285714,0.06101190476190476 | |
| MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,6,0.6428571428571428,0.03115079365079365 | |
| MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,7,0.5714285714285714,0.06101190476190476 | |
| MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,8,0.5714285714285714,0.06101190476190476 | |
| MixEval DROP Hard,mixeval_240829.csv,aggregate,aggregate,kendall,random,8,9,0.6428571428571428,0.03115079365079365 | |
| LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 | |
| LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 | |
| LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 | |
| LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111 | |
| LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9285714285714285,0.0003968253968253968 | |
| LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,5,0.8571428571428571,0.001736111111111111 | |
| LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,6,0.9285714285714285,0.0003968253968253968 | |
| LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,7,0.8571428571428571,0.001736111111111111 | |
| LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,8,0.9285714285714285,0.0003968253968253968 | |
| LiveBench 240725,livebench_240829.csv,aggregate,aggregate,kendall,random,8,9,0.8571428571428571,0.001736111111111111 | |
| LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6910233190806425,0.017844011512848347 | |
| LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9819805060619657,0.0007619896395304237 | |
| LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 | |
| LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9819805060619657,0.0007619896395304237 | |
| LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9092412093166348,0.0018276750354536814 | |
| LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,5,0.9819805060619657,0.0007619896395304237 | |
| LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,6,0.7857142857142856,0.005505952380952381 | |
| LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,7,0.9092412093166348,0.0018276750354536814 | |
| LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,8,0.8571428571428571,0.001736111111111111 | |
| LiveBench Reasoning,livebench_240829.csv,aggregate,aggregate,kendall,random,8,9,0.8571428571428571,0.001736111111111111 | |
| LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05 | |
| LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 | |
| LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762 | |
| LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9285714285714285,0.0003968253968253968 | |
| LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,5,0.9285714285714285,0.0003968253968253968 | |
| LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,6,0.7857142857142856,0.005505952380952381 | |
| LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,7,0.9285714285714285,0.0003968253968253968 | |
| LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,8,0.7857142857142856,0.005505952380952381 | |
| LiveBench Coding,livebench_240829.csv,aggregate,aggregate,kendall,random,8,9,0.8571428571428571,0.001736111111111111 | |
| LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05 | |
| LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 | |
| LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111 | |
| LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05 | |
| LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,5,0.9999999999999998,4.96031746031746e-05 | |
| LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,6,0.8571428571428571,0.001736111111111111 | |
| LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,7,0.9999999999999998,4.96031746031746e-05 | |
| LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,8,0.9285714285714285,0.0003968253968253968 | |
| LiveBench Mathematics,livebench_240829.csv,aggregate,aggregate,kendall,random,8,9,0.9285714285714285,0.0003968253968253968 | |
| LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.5714285714285714,0.06101190476190476 | |
| LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.3571428571428571,0.27509920634920637 | |
| LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6428571428571428,0.03115079365079365 | |
| LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,5,0.4999999999999999,0.10868055555555556 | |
| LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,6,0.3571428571428571,0.27509920634920637 | |
| LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,7,0.42857142857142855,0.17886904761904762 | |
| LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,8,0.6428571428571428,0.03115079365079365 | |
| LiveBench Data Analysis,livebench_240829.csv,aggregate,aggregate,kendall,random,8,9,0.4999999999999999,0.10868055555555556 | |
| LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.42857142857142855,0.17886904761904762 | |
| LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 | |
| LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476 | |
| LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6428571428571428,0.03115079365079365 | |
| LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,5,0.5714285714285714,0.06101190476190476 | |
| LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,6,0.6428571428571428,0.03115079365079365 | |
| LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,7,0.5714285714285714,0.06101190476190476 | |
| LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,8,0.7142857142857142,0.014136904761904762 | |
| LiveBench Language,livebench_240829.csv,aggregate,aggregate,kendall,random,8,9,0.5714285714285714,0.06101190476190476 | |
| LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.6428571428571428,0.03115079365079365 | |
| LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.4999999999999999,0.10868055555555556 | |
| LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,5,0.5714285714285714,0.06101190476190476 | |
| LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,6,0.5714285714285714,0.06101190476190476 | |
| LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,7,0.4999999999999999,0.10868055555555556 | |
| LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,8,0.7857142857142856,0.005505952380952381 | |
| LiveBench Instruction Following,livebench_240829.csv,aggregate,aggregate,kendall,random,8,9,0.4999999999999999,0.10868055555555556 | |
| WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 | |
| WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 | |
| WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 | |
| WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381 | |
| WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,5,0.7857142857142856,0.005505952380952381 | |
| WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,6,0.7857142857142856,0.005505952380952381 | |
| WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,7,0.7857142857142856,0.005505952380952381 | |
| WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,8,0.7857142857142856,0.005505952380952381 | |
| WildBench Elo LC,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,9,0.7857142857142856,0.005505952380952381 | |
| WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111 | |
| WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381 | |
| WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,5,0.7857142857142856,0.005505952380952381 | |
| WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,6,0.7142857142857142,0.014136904761904762 | |
| WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,7,0.7142857142857142,0.014136904761904762 | |
| WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,8,0.7857142857142856,0.005505952380952381 | |
| WildBench Information Seeking,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,9,0.7142857142857142,0.014136904761904762 | |
| WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.7857142857142856,0.005505952380952381 | |
| WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.6428571428571428,0.03115079365079365 | |
| WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,5,0.7142857142857142,0.014136904761904762 | |
| WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,6,0.6428571428571428,0.03115079365079365 | |
| WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,7,0.6428571428571428,0.03115079365079365 | |
| WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,8,0.6428571428571428,0.03115079365079365 | |
| WildBench Creative,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,9,0.5714285714285714,0.06101190476190476 | |
| WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 | |
| WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 | |
| WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381 | |
| WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,5,0.8571428571428571,0.001736111111111111 | |
| WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,6,0.8571428571428571,0.001736111111111111 | |
| WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,7,0.8571428571428571,0.001736111111111111 | |
| WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,8,0.7857142857142856,0.005505952380952381 | |
| WildBench Code Debugging,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,9,0.7857142857142856,0.005505952380952381 | |
| WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 | |
| WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 | |
| WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381 | |
| WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,5,0.8571428571428571,0.001736111111111111 | |
| WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,6,0.8571428571428571,0.001736111111111111 | |
| WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,7,0.8571428571428571,0.001736111111111111 | |
| WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,8,0.7857142857142856,0.005505952380952381 | |
| WildBench Math & Data,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,9,0.7857142857142856,0.005505952380952381 | |
| WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 | |
| WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111 | |
| WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381 | |
| WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,5,0.8571428571428571,0.001736111111111111 | |
| WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,6,0.7857142857142856,0.005505952380952381 | |
| WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,7,0.7857142857142856,0.005505952380952381 | |
| WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,8,0.7857142857142856,0.005505952380952381 | |
| WildBench Reasoning & Planning,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,9,0.7142857142857142,0.014136904761904762 | |
| WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,2,0.7857142857142856,0.005505952380952381 | |
| WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111 | |
| WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381 | |
| WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,5,0.8571428571428571,0.001736111111111111 | |
| WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,6,0.7857142857142856,0.005505952380952381 | |
| WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,7,0.7857142857142856,0.005505952380952381 | |
| WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,8,0.7857142857142856,0.005505952380952381 | |
| WildBench Score,wildbench_240829.csv,aggregate,aggregate,kendall,random,8,9,0.7142857142857142,0.014136904761904762 | |
| Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 | |
| Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 | |
| Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 | |
| Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111 | |
| Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111 | |
| Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,5,0.7857142857142856,0.005505952380952381 | |
| Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,6,0.8571428571428571,0.001736111111111111 | |
| Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,7,0.8571428571428571,0.001736111111111111 | |
| Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,8,0.8571428571428571,0.001736111111111111 | |
| Arena Hard,arena_hard_240829.csv,aggregate,aggregate,kendall,random,8,9,0.8571428571428571,0.001736111111111111 | |
| LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 | |
| LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 | |
| LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 | |
| LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05 | |
| LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,5,0.9285714285714285,0.0003968253968253968 | |
| LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,6,0.8571428571428571,0.001736111111111111 | |
| LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,7,0.8571428571428571,0.001736111111111111 | |
| LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,8,0.9285714285714285,0.0003968253968253968 | |
| LiveBench 240624,livebench_240701.csv,aggregate,aggregate,kendall,random,8,9,0.8571428571428571,0.001736111111111111 | |
| LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.6910233190806425,0.017844011512848347 | |
| LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.6182840223353117,0.0340492747686748 | |
| LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 | |
| LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111 | |
| LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,5,0.7857142857142856,0.005505952380952381 | |
| LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,6,0.7142857142857142,0.014136904761904762 | |
| LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,7,0.7637626158259734,0.008839740160738534 | |
| LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,8,0.7637626158259734,0.008839740160738534 | |
| LiveBench Reasoning Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,9,0.7637626158259734,0.008839740160738534 | |
| LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111 | |
| LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111 | |
| LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,5,0.7142857142857142,0.014136904761904762 | |
| LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,6,0.8571428571428571,0.001736111111111111 | |
| LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,7,0.6428571428571428,0.03115079365079365 | |
| LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,8,0.8571428571428571,0.001736111111111111 | |
| LiveBench Coding Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,9,0.8571428571428571,0.001736111111111111 | |
| LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 | |
| LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.8571428571428571,0.001736111111111111 | |
| LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,3,0.8571428571428571,0.001736111111111111 | |
| LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111 | |
| LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,5,0.7857142857142856,0.005505952380952381 | |
| LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,6,0.9999999999999998,4.96031746031746e-05 | |
| LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,7,0.8571428571428571,0.001736111111111111 | |
| LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,8,0.9999999999999998,4.96031746031746e-05 | |
| LiveBench Mathematics Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,9,0.9999999999999998,4.96031746031746e-05 | |
| LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.21428571428571427,0.5484126984126985 | |
| LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476 | |
| LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,4,0.4999999999999999,0.10868055555555556 | |
| LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,5,0.3571428571428571,0.27509920634920637 | |
| LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,6,0.42857142857142855,0.17886904761904762 | |
| LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,7,0.3571428571428571,0.27509920634920637 | |
| LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,8,0.5714285714285714,0.06101190476190476 | |
| LiveBench Data Analysis Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,9,0.42857142857142855,0.17886904761904762 | |
| LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,3,0.5714285714285714,0.06101190476190476 | |
| LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,4,0.7857142857142856,0.005505952380952381 | |
| LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,5,0.5714285714285714,0.06101190476190476 | |
| LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,6,0.6428571428571428,0.03115079365079365 | |
| LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,7,0.5714285714285714,0.06101190476190476 | |
| LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,8,0.5714285714285714,0.06101190476190476 | |
| LiveBench Language Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,9,0.7142857142857142,0.014136904761904762 | |
| LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,3,0.7142857142857142,0.014136904761904762 | |
| LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,4,0.8571428571428571,0.001736111111111111 | |
| LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,5,0.6428571428571428,0.03115079365079365 | |
| LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,6,0.5714285714285714,0.06101190476190476 | |
| LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,7,0.6428571428571428,0.03115079365079365 | |
| LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,8,0.8571428571428571,0.001736111111111111 | |
| LiveBench Instruction Following Average,livebench_240701.csv,aggregate,aggregate,kendall,random,8,9,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,5,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,6,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,7,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,8,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HF OpenLLM v2,hf_open_llm_v2_240829.csv,kendall,random,8,9,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,5,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,6,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,7,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,8,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HFv2 BBH,hf_open_llm_v2_240829.csv,kendall,random,8,9,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,3,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,4,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,5,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,6,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,7,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,8,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,HFv2 GPQA,hf_open_llm_v2_240829.csv,kendall,random,8,9,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,5,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,6,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,7,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,8,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,HFv2 IFEval,hf_open_llm_v2_240829.csv,kendall,random,8,9,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,5,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,6,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,7,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,8,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HFv2 MMLU Pro,hf_open_llm_v2_240829.csv,kendall,random,8,9,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.3706246583305506,0.20891238174069848 | |
| aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.3706246583305506,0.20891238174069848 | |
| aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.3706246583305506,0.20891238174069848 | |
| aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,3,0.3706246583305506,0.20891238174069848 | |
| aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,4,0.3706246583305506,0.20891238174069848 | |
| aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,5,0.3706246583305506,0.20891238174069848 | |
| aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,6,0.3706246583305506,0.20891238174069848 | |
| aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,7,0.3706246583305506,0.20891238174069848 | |
| aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,8,0.3706246583305506,0.20891238174069848 | |
| aggregate,aggregate,HFv2 Math Level 5,hf_open_llm_v2_240829.csv,kendall,random,8,9,0.3706246583305506,0.20891238174069848 | |
| aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,4,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,5,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,6,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,7,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,8,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,HFv2 MuSR,hf_open_llm_v2_240829.csv,kendall,random,8,9,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,8,5,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,8,6,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,8,7,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,8,8,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LMSys Arena,chatbot_arena_240829.csv,kendall,random,8,9,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,3,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,4,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,5,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,6,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,7,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,8,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,MixEval,mixeval_240829.csv,kendall,random,8,9,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,4,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,5,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,6,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,7,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,8,0.9999999999999998,4.96031746031746e-05 | |
| aggregate,aggregate,MixEval Hard,mixeval_240829.csv,kendall,random,8,9,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,3,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,4,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,5,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,6,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,7,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,8,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,MixEval TriviaQA,mixeval_240829.csv,kendall,random,8,9,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,4,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,5,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,6,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,7,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,8,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,MixEval MMLU,mixeval_240829.csv,kendall,random,8,9,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,3,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,4,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,5,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,6,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,7,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,8,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,MixEval DROP,mixeval_240829.csv,kendall,random,8,9,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,1,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,3,0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,4,0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,5,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,6,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,7,0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,8,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,MixEval HellaSwag,mixeval_240829.csv,kendall,random,8,9,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,0,0.836501912571304,0.004136737098676645 | |
| aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,1,0.6910233190806425,0.017844011512848347 | |
| aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,2,0.6910233190806425,0.017844011512848347 | |
| aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,3,0.5929994533288809,0.04437842734548688 | |
| aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,4,0.5929994533288809,0.04437842734548688 | |
| aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,5,0.6671243849949912,0.02370907646413876 | |
| aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,6,0.6910233190806425,0.017844011512848347 | |
| aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,7,0.5929994533288809,0.04437842734548688 | |
| aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,8,0.6671243849949912,0.02370907646413876 | |
| aggregate,aggregate,MixEval CommonsenseQA,mixeval_240829.csv,kendall,random,8,9,0.6910233190806425,0.017844011512848347 | |
| aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,4,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,5,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,6,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,7,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,8,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,MixEval TriviaQA Hard,mixeval_240829.csv,kendall,random,8,9,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,5,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,6,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,7,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,8,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,MixEval MMLU Hard,mixeval_240829.csv,kendall,random,8,9,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,4,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,5,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,6,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,7,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,8,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,MixEval DROP Hard,mixeval_240829.csv,kendall,random,8,9,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,4,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,5,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,6,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,7,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,8,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,LiveBench 240725,livebench_240829.csv,kendall,random,8,9,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,0,0.6910233190806425,0.017844011512848347 | |
| aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,1,0.9819805060619657,0.0007619896395304237 | |
| aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,3,0.9819805060619657,0.0007619896395304237 | |
| aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,4,0.9092412093166348,0.0018276750354536814 | |
| aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,5,0.9819805060619657,0.0007619896395304237 | |
| aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,6,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,7,0.9092412093166348,0.0018276750354536814 | |
| aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,8,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench Reasoning,livebench_240829.csv,kendall,random,8,9,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05 | |
| aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,4,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,5,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,6,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,7,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,8,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,LiveBench Coding,livebench_240829.csv,kendall,random,8,9,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,1,0.9999999999999998,4.96031746031746e-05 | |
| aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05 | |
| aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,5,0.9999999999999998,4.96031746031746e-05 | |
| aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,6,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,7,0.9999999999999998,4.96031746031746e-05 | |
| aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,8,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,LiveBench Mathematics,livebench_240829.csv,kendall,random,8,9,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,0,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,2,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,3,0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,4,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,5,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,6,0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,7,0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,8,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,LiveBench Data Analysis,livebench_240829.csv,kendall,random,8,9,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,0,0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,4,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,5,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,6,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,7,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,8,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,LiveBench Language,livebench_240829.csv,kendall,random,8,9,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,3,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,4,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,5,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,6,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,7,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,8,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,LiveBench Instruction Following,livebench_240829.csv,kendall,random,8,9,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,5,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,6,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,7,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,8,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Elo LC,wildbench_240829.csv,kendall,random,8,9,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,5,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,6,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,7,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,8,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Information Seeking,wildbench_240829.csv,kendall,random,8,9,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,3,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,4,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,5,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,6,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,7,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,8,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,WildBench Creative,wildbench_240829.csv,kendall,random,8,9,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,5,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,6,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,7,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,8,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Code Debugging,wildbench_240829.csv,kendall,random,8,9,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,5,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,6,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,7,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,8,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Math & Data,wildbench_240829.csv,kendall,random,8,9,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,5,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,6,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,7,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,8,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Reasoning & Planning,wildbench_240829.csv,kendall,random,8,9,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,2,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,5,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,6,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,7,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,8,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,WildBench Score,wildbench_240829.csv,kendall,random,8,9,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,1,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,5,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,6,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,7,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,8,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,Arena Hard,arena_hard_240829.csv,kendall,random,8,9,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,2,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,4,0.9999999999999998,4.96031746031746e-05 | |
| aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,5,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,6,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,7,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,8,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,LiveBench 240624,livebench_240701.csv,kendall,random,8,9,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,0,0.6910233190806425,0.017844011512848347 | |
| aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,2,0.6182840223353117,0.0340492747686748 | |
| aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,3,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,5,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,6,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,7,0.7637626158259734,0.008839740160738534 | |
| aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,8,0.7637626158259734,0.008839740160738534 | |
| aggregate,aggregate,LiveBench Reasoning Average,livebench_240701.csv,kendall,random,8,9,0.7637626158259734,0.008839740160738534 | |
| aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,0,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,1,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,5,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,6,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,7,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,8,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench Coding Average,livebench_240701.csv,kendall,random,8,9,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,0,0.9285714285714285,0.0003968253968253968 | |
| aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,1,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,2,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,3,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,5,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,6,0.9999999999999998,4.96031746031746e-05 | |
| aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,7,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,8,0.9999999999999998,4.96031746031746e-05 | |
| aggregate,aggregate,LiveBench Mathematics Average,livebench_240701.csv,kendall,random,8,9,0.9999999999999998,4.96031746031746e-05 | |
| aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,0,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,1,0.21428571428571427,0.5484126984126985 | |
| aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,4,0.4999999999999999,0.10868055555555556 | |
| aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,5,0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,6,0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,7,0.3571428571428571,0.27509920634920637 | |
| aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,8,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,LiveBench Data Analysis Average,livebench_240701.csv,kendall,random,8,9,0.42857142857142855,0.17886904761904762 | |
| aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,0,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,1,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,2,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,3,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,4,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,5,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,6,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,7,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,8,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,LiveBench Language Average,livebench_240701.csv,kendall,random,8,9,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,0,0.7857142857142856,0.005505952380952381 | |
| aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,1,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,2,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,3,0.7142857142857142,0.014136904761904762 | |
| aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,4,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,5,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,6,0.5714285714285714,0.06101190476190476 | |
| aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,7,0.6428571428571428,0.03115079365079365 | |
| aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,8,0.8571428571428571,0.001736111111111111 | |
| aggregate,aggregate,LiveBench Instruction Following Average,livebench_240701.csv,kendall,random,8,9,0.7142857142857142,0.014136904761904762 | |