[ { "config": { "model_name": "ChatGPT-4o-latest (2024-09-03)", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Average Score": 93.51557945652831, "Standard Deviation": 3.1900396436407785, "Rank": 4 }, "Geometry": { "Average Score": 81.8536937387725, "Standard Deviation": null, "Rank": 5 }, "Algebra": { "Average Score": 89.3642910524324, "Standard Deviation": null, "Rank": 3 }, "Probability": { "Average Score": 86.55761073510537, "Standard Deviation": null, "Rank": 4 }, "Logical": { "Average Score": 97.39734315785844, "Standard Deviation": null, "Rank": 2 }, "Social": { "Average Score": 91.03727530739368, "Standard Deviation": null, "Rank": 7 }, "Chemistry": { "Average Score": 100.0, "Standard Deviation": null, "Rank": 1 }, "CPP": { "Average Score": 100.0, "Standard Deviation": null, "Rank": 1 } } }, { "config": { "model_name": "gpt-4o-2024-08-06", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Average Score": 79.7806321863411, "Standard Deviation": 0.8302330946013555, "Rank": 14 }, "Geometry": { "Average Score": 86.29041459755453, "Standard Deviation": null, "Rank": 2 }, "Algebra": { "Average Score": 88.53373721863113, "Standard Deviation": null, "Rank": 4 }, "Probability": { "Average Score": 78.694360721361, "Standard Deviation": null, "Rank": 7 }, "Logical": { "Average Score": 78.3116623496895, "Standard Deviation": null, "Rank": 12 }, "Social": { "Average Score": 79.90944696263446, "Standard Deviation": null, "Rank": 11 }, "Chemistry": { "Average Score": 86.96011263543132, "Standard Deviation": null, "Rank": 7 }, "CPP": { "Average Score": 92.43090226400756, "Standard Deviation": null, "Rank": 2 } } }, { "config": { "model_name": "gpt-4o-2024-05-13", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Average Score": 86.40675398236253, "Standard Deviation": 6.473604235710212, "Rank": 9 }, "Geometry": { "Average Score": 82.42032988843268, "Standard Deviation": null, "Rank": 4 }, "Algebra": { "Average Score": 83.51580675782952, "Standard Deviation": null, "Rank": 9 }, "Probability": { "Average Score": 81.88434691830915, "Standard Deviation": null, "Rank": 5 }, "Logical": { "Average Score": 87.92744931984977, "Standard Deviation": null, "Rank": 9 }, "Social": { "Average Score": 76.12369632852445, "Standard Deviation": null, "Rank": 15 }, "Chemistry": { "Average Score": 90.93459148149344, "Standard Deviation": null, "Rank": 4 }, "CPP": { "Average Score": 79.1592634699295, "Standard Deviation": null, "Rank": 6 } } }, { "config": { "model_name": "gpt-4-turbo-2024-04-09", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Average Score": 87.17581147282237, "Standard Deviation": 8.716963621850567, "Rank": 8 }, "Geometry": { "Average Score": 78.76635545274637, "Standard Deviation": null, "Rank": 7 }, "Algebra": { "Average Score": 79.96323615621023, "Standard Deviation": null, "Rank": 11 }, "Probability": { "Average Score": 77.65333799733705, "Standard Deviation": null, "Rank": 9 }, "Logical": { "Average Score": 89.33307138659873, "Standard Deviation": null, "Rank": 8 }, "Social": { "Average Score": 76.86597570996584, "Standard Deviation": null, "Rank": 14 }, "Chemistry": { "Average Score": 84.02855687506661, "Standard Deviation": null, "Rank": 9 }, "CPP": { "Average Score": 70.73143363230263, "Standard Deviation": null, "Rank": 11 } } }, { "config": 
{ "model_name": "gemini-1.5-pro-001", "organization": "Google", "license": "Proprietary", "knowledge_cutoff": "2023/11" }, "results": { "OVERALL": { "Average Score": 80.38345723548734, "Standard Deviation": 2.4635699815143584, "Rank": 13 }, "Geometry": { "Average Score": 84.30455076458965, "Standard Deviation": null, "Rank": 3 }, "Algebra": { "Average Score": 85.9212061409364, "Standard Deviation": null, "Rank": 6 }, "Probability": { "Average Score": 73.11806712394745, "Standard Deviation": null, "Rank": 13 }, "Logical": { "Average Score": 78.27369746632996, "Standard Deviation": null, "Rank": 12 }, "Social": { "Average Score": 79.57606824531047, "Standard Deviation": null, "Rank": 13 } } }, { "config": { "model_name": "qwen2-72b-instruct", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024/09" }, "results": { "OVERALL": { "Average Score": 74.44059692248071, "Standard Deviation": 2.3957041566666697, "Rank": 16 }, "Geometry": { "Average Score": 72.58490369919883, "Standard Deviation": null, "Rank": 11 }, "Algebra": { "Average Score": 88.53359632761772, "Standard Deviation": null, "Rank": 4 }, "Probability": { "Average Score": 80.19789976985243, "Standard Deviation": null, "Rank": 6 }, "Logical": { "Average Score": 72.76843081200641, "Standard Deviation": null, "Rank": 17 }, "Social": { "Average Score": 57.256064868444426, "Standard Deviation": null, "Rank": 19 }, "Chemistry": { "Average Score": 75.47190401351077, "Standard Deviation": null, "Rank": 12 }, "CPP": { "Average Score": 73.54037778797029, "Standard Deviation": null, "Rank": 7 } } }, { "config": { "model_name": "gpt-4o-mini-2024-07-18", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Average Score": 82.82456893277315, "Standard Deviation": 7.714840109805867, "Rank": 12 }, "Geometry": { "Average Score": 78.89323869622943, "Standard Deviation": null, "Rank": 6 }, "Algebra": { "Average Score": 84.8722603687823, "Standard Deviation": null, "Rank": 8 }, "Probability": { "Average Score": 78.6942843346463, "Standard Deviation": null, "Rank": 7 }, "Logical": { "Average Score": 85.68921109829361, "Standard Deviation": null, "Rank": 10 }, "Social": { "Average Score": 81.79892848722542, "Standard Deviation": null, "Rank": 10 }, "Chemistry": { "Average Score": 81.46805623180109, "Standard Deviation": null, "Rank": 10 }, "CPP": { "Average Score": 88.3877070580296, "Standard Deviation": null, "Rank": 3 } } }, { "config": { "model_name": "claude-3.5-sonnet", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "2024/04" }, "results": { "OVERALL": { "Average Score": 88.43557924843628, "Standard Deviation": 5.680338106806327, "Rank": 7 }, "Geometry": { "Average Score": 76.26169400931595, "Standard Deviation": null, "Rank": 10 }, "Algebra": { "Average Score": 77.15040433072186, "Standard Deviation": null, "Rank": 13 }, "Probability": { "Average Score": 73.9942759783754, "Standard Deviation": null, "Rank": 11 }, "Logical": { "Average Score": 89.70827617930533, "Standard Deviation": null, "Rank": 7 }, "Social": { "Average Score": 97.3810636467068, "Standard Deviation": null, "Rank": 3 }, "Chemistry": { "Average Score": 94.92819763202698, "Standard Deviation": null, "Rank": 3 }, "CPP": { "Average Score": 82.37734076815008, "Standard Deviation": null, "Rank": 5 } } }, { "config": { "model_name": "o1-mini", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Average Score": 
96.12399889226096, "Standard Deviation": 0.5674965705992511, "Rank": 2 }, "Geometry": { "Average Score": 100.0, "Standard Deviation": null, "Rank": 1 }, "Algebra": { "Average Score": 100.0, "Standard Deviation": null, "Rank": 1 }, "Probability": { "Average Score": 100.0, "Standard Deviation": null, "Rank": 1 }, "Logical": { "Average Score": 96.52089445393929, "Standard Deviation": null, "Rank": 3 }, "Social": { "Average Score": 95.00695256918654, "Standard Deviation": null, "Rank": 5 } } }, { "config": { "model_name": "o1-preview", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Average Score": 91.08240629161766, "Standard Deviation": 4.83378135710071, "Rank": 5 }, "Geometry": { "Average Score": "N/A", "Standard Deviation": "N/A", "Rank": "N/A" }, "Algebra": { "Average Score": 98.1870991822192, "Standard Deviation": null, "Rank": 2 }, "Probability": { "Average Score": 94.12657646584134, "Standard Deviation": null, "Rank": 2 }, "Logical": { "Average Score": 100.0, "Standard Deviation": null, "Rank": 1 }, "Social": { "Average Score": 96.56802743955569, "Standard Deviation": null, "Rank": 4 } } }, { "config": { "model_name": "gemini-1.5-flash-001", "organization": "Google", "license": "Proprietary", "knowledge_cutoff": "2023/11" }, "results": { "OVERALL": { "Average Score": 66.25275609135964, "Standard Deviation": 2.5314573702881438, "Rank": 20 }, "Geometry": { "Average Score": 66.8010242138006, "Standard Deviation": null, "Rank": 13 }, "Algebra": { "Average Score": 78.24639082497596, "Standard Deviation": null, "Rank": 12 }, "Probability": { "Average Score": 67.84602916736804, "Standard Deviation": null, "Rank": 15 }, "Logical": { "Average Score": 72.76845749138818, "Standard Deviation": null, "Rank": 17 }, "Social": { "Average Score": 68.57728479711058, "Standard Deviation": null, "Rank": 16 }, "Chemistry": { "Average Score": 75.47188329078935, "Standard Deviation": null, "Rank": 12 }, "CPP": { "Average Score": 72.1127762005651, "Standard Deviation": null, "Rank": 10 } } }, { "config": { "model_name": "gpt4-1106", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2023/04" }, "results": { "OVERALL": { "Average Score": 85.660054434658, "Standard Deviation": 7.392502344300497, "Rank": 10 }, "Geometry": { "Average Score": 63.36396165140893, "Standard Deviation": null, "Rank": 15 }, "Algebra": { "Average Score": 74.67191687355754, "Standard Deviation": null, "Rank": 15 }, "Probability": { "Average Score": 71.35141952665965, "Standard Deviation": null, "Rank": 14 }, "Logical": { "Average Score": 76.34506017196868, "Standard Deviation": null, "Rank": 15 }, "Social": { "Average Score": 46.00126575332808, "Standard Deviation": null, "Rank": 25 }, "Chemistry": { "Average Score": 78.70156756289569, "Standard Deviation": null, "Rank": 11 }, "CPP": { "Average Score": 69.11824072252848, "Standard Deviation": null, "Rank": 12 } } }, { "config": { "model_name": "gemma-2-27b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/06" }, "results": { "OVERALL": { "Average Score": 70.82622192650408, "Standard Deviation": 0.18962869075029884, "Rank": 18 }, "Geometry": { "Average Score": 58.25724467150374, "Standard Deviation": null, "Rank": 16 }, "Algebra": { "Average Score": 73.71614711121721, "Standard Deviation": null, "Rank": 16 }, "Probability": { "Average Score": 66.08200742339983, "Standard Deviation": null, "Rank": 17 }, "Logical": { "Average Score": 72.76841354275011, "Standard 
Deviation": null, "Rank": 17 }, "Social": { "Average Score": 53.736358144621576, "Standard Deviation": null, "Rank": 21 }, "Chemistry": { "Average Score": 68.1178055540124, "Standard Deviation": null, "Rank": 17 }, "CPP": { "Average Score": 63.28920072143611, "Standard Deviation": null, "Rank": 14 } } }, { "config": { "model_name": "claude-3-opus", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "2023/08" }, "results": { "OVERALL": { "Average Score": 82.28903171580336, "Standard Deviation": 10.093273304495547, "Rank": 11 }, "Geometry": { "Average Score": 57.98602891013921, "Standard Deviation": null, "Rank": 17 }, "Algebra": { "Average Score": 73.54334730242743, "Standard Deviation": null, "Rank": 18 }, "Probability": { "Average Score": 67.8341594991468, "Standard Deviation": null, "Rank": 15 }, "Logical": { "Average Score": 78.31155849680502, "Standard Deviation": null, "Rank": 12 }, "Social": { "Average Score": 90.45833112761075, "Standard Deviation": null, "Rank": 8 }, "Chemistry": { "Average Score": 85.97349470177741, "Standard Deviation": null, "Rank": 8 }, "CPP": { "Average Score": 73.5404403567132, "Standard Deviation": null, "Rank": 8 } } }, { "config": { "model_name": "gemma-2-9b-it-simpo", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/07" }, "results": { "OVERALL": { "Average Score": "N/A", "Standard Deviation": "N/A", "Rank": "N/A" }, "Geometry": { "Average Score": 52.80896798216458, "Standard Deviation": null, "Rank": 19 }, "Algebra": { "Average Score": 69.60260038105677, "Standard Deviation": null, "Rank": 19 }, "Probability": { "Average Score": 59.52630271491633, "Standard Deviation": null, "Rank": 21 }, "Logical": { "Average Score": 63.57920031465781, "Standard Deviation": null, "Rank": 23 }, "Social": { "Average Score": 79.90950201631269, "Standard Deviation": null, "Rank": 11 }, "Chemistry": { "Average Score": 90.36508196626548, "Standard Deviation": null, "Rank": 5 }, "CPP": { "Average Score": 73.43757596214863, "Standard Deviation": null, "Rank": 9 } } }, { "config": { "model_name": "qwen1.5-72b-chat", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024/03" }, "results": { "OVERALL": { "Average Score": 65.26710370586439, "Standard Deviation": 9.198700753743012, "Rank": 19 }, "Geometry": { "Average Score": 48.52417714351894, "Standard Deviation": null, "Rank": 24 }, "Algebra": { "Average Score": 68.55765479604507, "Standard Deviation": null, "Rank": 20 }, "Probability": { "Average Score": 49.52382148131357, "Standard Deviation": null, "Rank": 26 }, "Logical": { "Average Score": 37.33563924001827, "Standard Deviation": null, "Rank": 35 }, "Social": { "Average Score": 46.00141195402727, "Standard Deviation": null, "Rank": 25 }, "Chemistry": { "Average Score": 52.625823960166215, "Standard Deviation": null, "Rank": 23 }, "CPP": { "Average Score": 48.69302376665551, "Standard Deviation": null, "Rank": 20 } } }, { "config": { "model_name": "qwen1.5-32b-chat", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024/03" }, "results": { "OVERALL": { "Average Score": 46.74335731441104, "Standard Deviation": 4.096227849530709, "Rank": 28 }, "Geometry": { "Average Score": 44.96670224519297, "Standard Deviation": null, "Rank": 26 }, "Algebra": { "Average Score": 63.19715848628476, "Standard Deviation": null, "Rank": 23 }, "Probability": { "Average Score": 48.59873650270336, "Standard Deviation": null, "Rank": 27 }, "Logical": { "Average Score": 42.028753105249216, 
"Standard Deviation": null, "Rank": 33 }, "Social": { "Average Score": 43.183938768454986, "Standard Deviation": null, "Rank": 28 }, "Chemistry": { "Average Score": 47.84488021045937, "Standard Deviation": null, "Rank": 26 }, "CPP": { "Average Score": 45.14284028264288, "Standard Deviation": null, "Rank": 24 } } }, { "config": { "model_name": "google-gemma-2-9b-it", "organization": "Google", "license": "Proprietary", "knowledge_cutoff": "2024/06" }, "results": { "OVERALL": { "Average Score": 60.71065949101693, "Standard Deviation": 0.12283018509137462, "Rank": 23 }, "Geometry": { "Average Score": 52.49270527783856, "Standard Deviation": null, "Rank": 20 }, "Algebra": { "Average Score": 63.446032975128176, "Standard Deviation": null, "Rank": 21 }, "Probability": { "Average Score": 63.95287475488081, "Standard Deviation": null, "Rank": 20 }, "Logical": { "Average Score": 70.18644584116615, "Standard Deviation": null, "Rank": 20 }, "Social": { "Average Score": 86.45401862572464, "Standard Deviation": null, "Rank": 9 }, "Chemistry": { "Average Score": 57.56342217758078, "Standard Deviation": null, "Rank": 20 }, "CPP": { "Average Score": 54.03167523687635, "Standard Deviation": null, "Rank": 17 } } }, { "config": { "model_name": "yi-1.5-34b-chat", "organization": "01 AI", "license": "Proprietary", "knowledge_cutoff": "2024/05" }, "results": { "OVERALL": { "Average Score": 71.53811567931923, "Standard Deviation": 0.4838075734512934, "Rank": 17 }, "Geometry": { "Average Score": 53.98343904373819, "Standard Deviation": null, "Rank": 18 }, "Algebra": { "Average Score": 63.317896075817885, "Standard Deviation": null, "Rank": 22 }, "Probability": { "Average Score": 64.73492918491159, "Standard Deviation": null, "Rank": 19 }, "Logical": { "Average Score": 66.39420245024361, "Standard Deviation": null, "Rank": 21 }, "Social": { "Average Score": 53.73650350964252, "Standard Deviation": null, "Rank": 21 }, "Chemistry": { "Average Score": 56.722360677914686, "Standard Deviation": null, "Rank": 21 }, "CPP": { "Average Score": 52.148798061768964, "Standard Deviation": null, "Rank": 18 } } }, { "config": { "model_name": "meta-llama-3.1-70b-instruct", "organization": "Meta", "license": "Llama 3.1 Community", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Average Score": 74.01502078434305, "Standard Deviation": 0.24116839515156926, "Rank": 15 }, "Geometry": { "Average Score": 66.80097850274383, "Standard Deviation": null, "Rank": 13 }, "Algebra": { "Average Score": 74.7667367179752, "Standard Deviation": null, "Rank": 14 }, "Probability": { "Average Score": 66.0819470113051, "Standard Deviation": null, "Rank": 17 }, "Logical": { "Average Score": 73.68238947162197, "Standard Deviation": null, "Rank": 16 }, "Social": { "Average Score": 68.577541438994, "Standard Deviation": null, "Rank": 16 }, "Chemistry": { "Average Score": 70.4019514562452, "Standard Deviation": null, "Rank": 15 }, "CPP": { "Average Score": 84.36815192532764, "Standard Deviation": null, "Rank": 4 } } }, { "config": { "model_name": "meta-llama-3.1-8b-instruct", "organization": "Meta", "license": "Llama 3.1 Community", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Average Score": 55.268736955905695, "Standard Deviation": 7.060517225126177, "Rank": 26 }, "Geometry": { "Average Score": 42.44262022417502, "Standard Deviation": null, "Rank": 28 }, "Algebra": { "Average Score": 60.632347391080486, "Standard Deviation": null, "Rank": 25 }, "Probability": { "Average Score": 52.372362507453694, "Standard Deviation": null, 
"Rank": 24 }, "Logical": { "Average Score": 54.17571378414435, "Standard Deviation": null, "Rank": 28 }, "Social": { "Average Score": 39.07966801070027, "Standard Deviation": null, "Rank": 31 }, "Chemistry": { "Average Score": 45.0170262190059, "Standard Deviation": null, "Rank": 29 }, "CPP": { "Average Score": 44.41846841004584, "Standard Deviation": null, "Rank": 26 } } }, { "config": { "model_name": "gpt3.5-turbo-0125", "organization": "OpenAI", "license": "Proprietary", "knowledge_cutoff": "2021/09" }, "results": { "OVERALL": { "Average Score": 29.17379433602279, "Standard Deviation": 2.6813415847393878, "Rank": 44 }, "Geometry": { "Average Score": 51.47279337094397, "Standard Deviation": null, "Rank": 21 }, "Algebra": { "Average Score": 59.03601450977881, "Standard Deviation": null, "Rank": 26 }, "Probability": { "Average Score": 46.71541304474977, "Standard Deviation": null, "Rank": 28 }, "Logical": { "Average Score": 20.82026871015984, "Standard Deviation": null, "Rank": 46 }, "Social": { "Average Score": 28.31096293069848, "Standard Deviation": null, "Rank": 41 }, "Chemistry": { "Average Score": 42.899594571904004, "Standard Deviation": null, "Rank": 31 }, "CPP": { "Average Score": 40.46958736582551, "Standard Deviation": null, "Rank": 29 } } }, { "config": { "model_name": "llama-3-70b-instruct", "organization": "Meta", "license": "Llama 3 Community", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Average Score": 65.90407336557487, "Standard Deviation": 66.63940143516267, "Rank": 24 }, "Geometry": { "Average Score": 46.40555349958932, "Standard Deviation": null, "Rank": 25 }, "Algebra": { "Average Score": 60.86276607976933, "Standard Deviation": null, "Rank": 24 }, "Probability": { "Average Score": 55.0233135868055, "Standard Deviation": null, "Rank": 22 }, "Logical": { "Average Score": 83.99546392889077, "Standard Deviation": null, "Rank": 11 }, "Social": { "Average Score": 47.90189246663785, "Standard Deviation": null, "Rank": 23 }, "Chemistry": { "Average Score": 70.40198909396582, "Standard Deviation": null, "Rank": 15 }, "CPP": { "Average Score": 65.32140697218945, "Standard Deviation": null, "Rank": 13 } } }, { "config": { "model_name": "claude-3-sonnet", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "2023/08" }, "results": { "OVERALL": { "Average Score": 64.4278622266347, "Standard Deviation": 3.089828107392469, "Rank": 21 }, "Geometry": { "Average Score": 51.4677627365698, "Standard Deviation": null, "Rank": 21 }, "Algebra": { "Average Score": 57.157810499255426, "Standard Deviation": null, "Rank": 27 }, "Probability": { "Average Score": 54.68761427070592, "Standard Deviation": null, "Rank": 23 }, "Logical": { "Average Score": 65.8346271849297, "Standard Deviation": null, "Rank": 22 }, "Social": { "Average Score": 62.842721798877186, "Standard Deviation": null, "Rank": 18 }, "Chemistry": { "Average Score": 66.1914400411681, "Standard Deviation": null, "Rank": 18 }, "CPP": { "Average Score": 61.33538592327427, "Standard Deviation": null, "Rank": 15 } } }, { "config": { "model_name": "qwen1.5-14b-chat", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024/02" }, "results": { "OVERALL": { "Average Score": 44.920016997055804, "Standard Deviation": 0.3041914765974254, "Rank": 30 }, "Geometry": { "Average Score": 36.40735570120079, "Standard Deviation": null, "Rank": 30 }, "Algebra": { "Average Score": 56.004717588310726, "Standard Deviation": null, "Rank": 28 }, "Probability": { "Average Score": 
39.24866255465088, "Standard Deviation": null, "Rank": 33 }, "Logical": { "Average Score": 35.15462916949486, "Standard Deviation": null, "Rank": 38 }, "Social": { "Average Score": 35.236185321936766, "Standard Deviation": null, "Rank": 34 }, "Chemistry": { "Average Score": 40.803706763362605, "Standard Deviation": null, "Rank": 34 }, "CPP": { "Average Score": 38.552779976347026, "Standard Deviation": null, "Rank": 31 } } }, { "config": { "model_name": "claude-3-haiku", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "2023/08" }, "results": { "OVERALL": { "Average Score": 53.46814061793852, "Standard Deviation": 10.143567097006747, "Rank": 25 }, "Geometry": { "Average Score": 42.87542087805953, "Standard Deviation": null, "Rank": 27 }, "Algebra": { "Average Score": 53.706856083803686, "Standard Deviation": null, "Rank": 30 }, "Probability": { "Average Score": 49.80372052799326, "Standard Deviation": null, "Rank": 25 }, "Logical": { "Average Score": 62.585349577709394, "Standard Deviation": null, "Rank": 24 }, "Social": { "Average Score": 57.25601125762336, "Standard Deviation": null, "Rank": 19 }, "Chemistry": { "Average Score": 60.48921113945562, "Standard Deviation": null, "Rank": 19 }, "CPP": { "Average Score": 56.40200048817984, "Standard Deviation": null, "Rank": 16 } } }, { "config": { "model_name": "claude-2.1", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "Unknown" }, "results": { "OVERALL": { "Average Score": 39.855928282633364, "Standard Deviation": 8.396129652430814, "Rank": 35 }, "Geometry": { "Average Score": 51.1749207092159, "Standard Deviation": null, "Rank": 23 }, "Algebra": { "Average Score": 53.05386216145516, "Standard Deviation": null, "Rank": 31 }, "Probability": { "Average Score": 44.42150447611455, "Standard Deviation": null, "Rank": 30 }, "Logical": { "Average Score": 60.51381867118053, "Standard Deviation": null, "Rank": 25 }, "Social": { "Average Score": 38.492280755756035, "Standard Deviation": null, "Rank": 32 }, "Chemistry": { "Average Score": 50.66182745698702, "Standard Deviation": null, "Rank": 24 }, "CPP": { "Average Score": 47.23672563994903, "Standard Deviation": null, "Rank": 21 } } }, { "config": { "model_name": "mixtral-8x7b-instruct-v0.1", "organization": "Mistral", "license": "Apache 2.0", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Average Score": 42.70451051343715, "Standard Deviation": 9.965602920103015, "Rank": 31 }, "Geometry": { "Average Score": 33.473933494899164, "Standard Deviation": null, "Rank": 34 }, "Algebra": { "Average Score": 48.99207852115047, "Standard Deviation": null, "Rank": 34 }, "Probability": { "Average Score": 44.46936520340586, "Standard Deviation": null, "Rank": 30 }, "Logical": { "Average Score": 42.656238987207246, "Standard Deviation": null, "Rank": 31 }, "Social": { "Average Score": 30.32900110312259, "Standard Deviation": null, "Rank": 40 }, "Chemistry": { "Average Score": 47.047104057571026, "Standard Deviation": null, "Rank": 27 }, "CPP": { "Average Score": 44.533118241976666, "Standard Deviation": null, "Rank": 25 } } }, { "config": { "model_name": "claude-2.0", "organization": "Anthropic", "license": "Proprietary", "knowledge_cutoff": "Unknown" }, "results": { "OVERALL": { "Average Score": 33.53990717968659, "Standard Deviation": 7.640386327990536, "Rank": 41 }, "Geometry": { "Average Score": 38.40953902052666, "Standard Deviation": null, "Rank": 29 }, "Algebra": { "Average Score": 49.07235259762855, "Standard Deviation": null, "Rank": 33 }, 
"Probability": { "Average Score": 46.71546649299419, "Standard Deviation": null, "Rank": 28 }, "Logical": { "Average Score": 56.26908965013192, "Standard Deviation": null, "Rank": 27 }, "Social": { "Average Score": 47.84034165469707, "Standard Deviation": null, "Rank": 23 }, "Chemistry": { "Average Score": 55.20362543510563, "Standard Deviation": null, "Rank": 22 }, "CPP": { "Average Score": 50.773143448036464, "Standard Deviation": null, "Rank": 19 } } }, { "config": { "model_name": "starling-lm-7b-beta", "organization": "Nexusflow", "license": "Apache-2.0", "knowledge_cutoff": "2024/03" }, "results": { "OVERALL": { "Average Score": 50.90398580969381, "Standard Deviation": 0.2839403187065694, "Rank": 27 }, "Geometry": { "Average Score": 34.653904247826965, "Standard Deviation": null, "Rank": 33 }, "Algebra": { "Average Score": 49.66265150940668, "Standard Deviation": null, "Rank": 32 }, "Probability": { "Average Score": 40.04695085773174, "Standard Deviation": null, "Rank": 32 }, "Logical": { "Average Score": 48.02284849364292, "Standard Deviation": null, "Rank": 29 }, "Social": { "Average Score": 42.82322308642107, "Standard Deviation": null, "Rank": 29 }, "Chemistry": { "Average Score": 40.54467030566931, "Standard Deviation": null, "Rank": 35 }, "CPP": { "Average Score": 38.27587102395908, "Standard Deviation": null, "Rank": 32 } } }, { "config": { "model_name": "gemini-1.0-pro-001", "organization": "Google", "license": "Proprietary", "knowledge_cutoff": "2023/04" }, "results": { "OVERALL": { "Average Score": 37.91102687366529, "Standard Deviation": 15.15111885239772, "Rank": 38 }, "Geometry": { "Average Score": 35.480853719259684, "Standard Deviation": null, "Rank": 32 }, "Algebra": { "Average Score": 48.08542847805497, "Standard Deviation": null, "Rank": 35 }, "Probability": { "Average Score": 29.862669786973395, "Standard Deviation": null, "Rank": 42 }, "Logical": { "Average Score": 24.141794297157134, "Standard Deviation": null, "Rank": 43 }, "Social": { "Average Score": 15.062345665891504, "Standard Deviation": null, "Rank": 51 }, "Chemistry": { "Average Score": 46.52522766257804, "Standard Deviation": null, "Rank": 28 }, "CPP": { "Average Score": 45.22204471452975, "Standard Deviation": null, "Rank": 23 } } }, { "config": { "model_name": "openchat-3.5-0106", "organization": "OpenChat", "license": "Apache-2.0", "knowledge_cutoff": "2024/01" }, "results": { "OVERALL": { "Average Score": 41.34314082389491, "Standard Deviation": 4.394481877390224, "Rank": 32 }, "Geometry": { "Average Score": 29.859015723426758, "Standard Deviation": null, "Rank": 36 }, "Algebra": { "Average Score": 45.79428201943078, "Standard Deviation": null, "Rank": 36 }, "Probability": { "Average Score": 38.766888608782956, "Standard Deviation": null, "Rank": 34 }, "Logical": { "Average Score": 42.1345774485532, "Standard Deviation": null, "Rank": 32 }, "Social": { "Average Score": 32.07155544930587, "Standard Deviation": null, "Rank": 39 }, "Chemistry": { "Average Score": 35.28601797606463, "Standard Deviation": null, "Rank": 37 }, "CPP": { "Average Score": 33.70639271807677, "Standard Deviation": null, "Rank": 33 } } }, { "config": { "model_name": "openchat-3.5", "organization": "OpenChat", "license": "Apache-2.0", "knowledge_cutoff": "2023/11" }, "results": { "OVERALL": { "Average Score": 39.60454188051808, "Standard Deviation": 0.8232501722386516, "Rank": 36 }, "Geometry": { "Average Score": 30.77657388742533, "Standard Deviation": null, "Rank": 35 }, "Algebra": { "Average Score": 42.13028451761782, "Standard 
Deviation": null, "Rank": 38 }, "Probability": { "Average Score": 34.817635171077754, "Standard Deviation": null, "Rank": 37 }, "Logical": { "Average Score": 36.21944706732088, "Standard Deviation": null, "Rank": 36 }, "Social": { "Average Score": 37.59265084241427, "Standard Deviation": null, "Rank": 33 }, "Chemistry": { "Average Score": 37.21911183748652, "Standard Deviation": null, "Rank": 36 }, "CPP": { "Average Score": 33.020911255646965, "Standard Deviation": null, "Rank": 34 } } }, { "config": { "model_name": "command-r-(08-2024)", "organization": "Cohere", "license": "CC-BY-NC-4.0", "knowledge_cutoff": "2024/08" }, "results": { "OVERALL": { "Average Score": 45.84310421663912, "Standard Deviation": 0.14535750785421472, "Rank": 29 }, "Geometry": { "Average Score": 36.33550343578038, "Standard Deviation": null, "Rank": 31 }, "Algebra": { "Average Score": 41.87079446639028, "Standard Deviation": null, "Rank": 39 }, "Probability": { "Average Score": 36.87662939858684, "Standard Deviation": null, "Rank": 36 }, "Logical": { "Average Score": 26.22482921268266, "Standard Deviation": null, "Rank": 41 }, "Social": { "Average Score": 35.11019761697373, "Standard Deviation": null, "Rank": 35 }, "Chemistry": { "Average Score": 41.81772722027254, "Standard Deviation": null, "Rank": 33 }, "CPP": { "Average Score": 39.61492485677676, "Standard Deviation": null, "Rank": 30 } } }, { "config": { "model_name": "gemma-1.1-7b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/02" }, "results": { "OVERALL": { "Average Score": 35.873210924652795, "Standard Deviation": 6.462625645064649, "Rank": 37 }, "Geometry": { "Average Score": 25.79207201693066, "Standard Deviation": null, "Rank": 40 }, "Algebra": { "Average Score": 40.58046616460041, "Standard Deviation": null, "Rank": 40 }, "Probability": { "Average Score": 29.581773053230897, "Standard Deviation": null, "Rank": 43 }, "Logical": { "Average Score": 41.99821650962693, "Standard Deviation": null, "Rank": 33 }, "Social": { "Average Score": 24.39015213949678, "Standard Deviation": null, "Rank": 43 }, "Chemistry": { "Average Score": 45.01706482033765, "Standard Deviation": null, "Rank": 29 }, "CPP": { "Average Score": 42.666504105798204, "Standard Deviation": null, "Rank": 27 } } }, { "config": { "model_name": "llama3-8b-instruct", "organization": "Meta", "license": "Llama 3 Community", "knowledge_cutoff": "2023/03" }, "results": { "OVERALL": { "Average Score": 39.00917270775336, "Standard Deviation": 3.999506140299149, "Rank": 39 }, "Geometry": { "Average Score": 29.224089668837465, "Standard Deviation": null, "Rank": 38 }, "Algebra": { "Average Score": 42.90961619082775, "Standard Deviation": null, "Rank": 37 }, "Probability": { "Average Score": 34.15721355738147, "Standard Deviation": null, "Rank": 38 }, "Logical": { "Average Score": 58.39773915370141, "Standard Deviation": null, "Rank": 26 }, "Social": { "Average Score": 40.88535401371015, "Standard Deviation": null, "Rank": 30 }, "Chemistry": { "Average Score": 49.70839372661025, "Standard Deviation": null, "Rank": 25 }, "CPP": { "Average Score": 45.35392139264795, "Standard Deviation": null, "Rank": 22 } } }, { "config": { "model_name": "gemma-2-2b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/07" }, "results": { "OVERALL": { "Average Score": 57.45780847204313, "Standard Deviation": 16.310023687014333, "Rank": 22 }, "Geometry": { "Average Score": 29.820233374501843, "Standard Deviation": null, "Rank": 36 }, "Algebra": { "Average 
Score": 39.873024674507214, "Standard Deviation": null, "Rank": 41 }, "Probability": { "Average Score": 31.85692359301203, "Standard Deviation": null, "Rank": 40 }, "Logical": { "Average Score": 43.93437465788311, "Standard Deviation": null, "Rank": 30 }, "Social": { "Average Score": 44.689420554662476, "Standard Deviation": null, "Rank": 27 }, "Chemistry": { "Average Score": 32.05704364512495, "Standard Deviation": null, "Rank": 40 }, "CPP": { "Average Score": 30.53406933106768, "Standard Deviation": null, "Rank": 36 } } }, { "config": { "model_name": "starling-lm-7b-alpha", "organization": "Nexusflow", "license": "Apache-2.0", "knowledge_cutoff": "2023/11" }, "results": { "OVERALL": { "Average Score": 40.625443347641045, "Standard Deviation": 3.0544259540377268, "Rank": 34 }, "Geometry": { "Average Score": 26.171147508308422, "Standard Deviation": null, "Rank": 39 }, "Algebra": { "Average Score": 39.149463007523856, "Standard Deviation": null, "Rank": 42 }, "Probability": { "Average Score": 32.36862021879827, "Standard Deviation": null, "Rank": 39 }, "Logical": { "Average Score": 34.17344938419256, "Standard Deviation": null, "Rank": 39 }, "Social": { "Average Score": 35.06966333212518, "Standard Deviation": null, "Rank": 35 }, "Chemistry": { "Average Score": 32.15932739848045, "Standard Deviation": null, "Rank": 39 }, "CPP": { "Average Score": 30.07926487356878, "Standard Deviation": null, "Rank": 37 } } }, { "config": { "model_name": "qwen1.5-4b-chat", "organization": "Alibaba", "license": "Qianwen LICENSE", "knowledge_cutoff": "2024/02" }, "results": { "OVERALL": { "Average Score": 11.723779019126527, "Standard Deviation": 0.856230353584155, "Rank": 53 }, "Geometry": { "Average Score": 16.072772563608115, "Standard Deviation": null, "Rank": 45 }, "Algebra": { "Average Score": 32.22626131587612, "Standard Deviation": null, "Rank": 44 }, "Probability": { "Average Score": 13.98282712349133, "Standard Deviation": null, "Rank": 48 }, "Logical": { "Average Score": 13.993097991375581, "Standard Deviation": null, "Rank": 51 }, "Social": { "Average Score": 22.955898106386442, "Standard Deviation": null, "Rank": 45 }, "Chemistry": { "Average Score": 13.907481529463642, "Standard Deviation": null, "Rank": 51 }, "CPP": { "Average Score": 13.21208067122554, "Standard Deviation": null, "Rank": 47 } } }, { "config": { "model_name": "command-r-(04-2024)", "organization": "Cohere", "license": "CC-BY-NC-4.0", "knowledge_cutoff": "2024/04" }, "results": { "OVERALL": { "Average Score": 43.08187135994592, "Standard Deviation": 0.7654553730614279, "Rank": 33 }, "Geometry": { "Average Score": 24.037084801508428, "Standard Deviation": null, "Rank": 41 }, "Algebra": { "Average Score": 32.37474440275246, "Standard Deviation": null, "Rank": 43 }, "Probability": { "Average Score": 31.014039425232298, "Standard Deviation": null, "Rank": 41 }, "Logical": { "Average Score": 35.49507014348235, "Standard Deviation": null, "Rank": 37 }, "Social": { "Average Score": 34.782695172510856, "Standard Deviation": null, "Rank": 37 }, "Chemistry": { "Average Score": 42.46395478814961, "Standard Deviation": null, "Rank": 32 }, "CPP": { "Average Score": 41.346336503003236, "Standard Deviation": null, "Rank": 28 } } }, { "config": { "model_name": "vicuna-33b", "organization": "LMSYS", "license": "Non-commercial", "knowledge_cutoff": "2023/08" }, "results": { "OVERALL": { "Average Score": 30.8582386682731, "Standard Deviation": 2.3851186735858945, "Rank": 42 }, "Geometry": { "Average Score": 17.058968577112452, "Standard 
Deviation": null, "Rank": 44 }, "Algebra": { "Average Score": 25.22004544023738, "Standard Deviation": null, "Rank": 45 }, "Probability": { "Average Score": 21.097169680647767, "Standard Deviation": null, "Rank": 46 }, "Logical": { "Average Score": 23.212667585279515, "Standard Deviation": null, "Rank": 45 }, "Social": { "Average Score": 32.357116321848025, "Standard Deviation": null, "Rank": 38 }, "Chemistry": { "Average Score": 29.376389899632898, "Standard Deviation": null, "Rank": 42 }, "CPP": { "Average Score": 28.01838653090379, "Standard Deviation": null, "Rank": 38 } } }, { "config": { "model_name": "gemma-7b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/02" }, "results": { "OVERALL": { "Average Score": 27.609692676933715, "Standard Deviation": 5.8350892031427435, "Rank": 45 }, "Geometry": { "Average Score": 20.127802528542947, "Standard Deviation": null, "Rank": 42 }, "Algebra": { "Average Score": 23.46400816161807, "Standard Deviation": null, "Rank": 47 }, "Probability": { "Average Score": 17.139514453170445, "Standard Deviation": null, "Rank": 47 }, "Logical": { "Average Score": 24.625290351028372, "Standard Deviation": null, "Rank": 42 }, "Social": { "Average Score": 26.715025606557614, "Standard Deviation": null, "Rank": 42 }, "Chemistry": { "Average Score": 29.383105099269972, "Standard Deviation": null, "Rank": 41 }, "CPP": { "Average Score": 28.014658234926813, "Standard Deviation": null, "Rank": 39 } } }, { "config": { "model_name": "mistral-7b-instruct-2", "organization": "Mistral", "license": "Apache 2.0", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Average Score": 32.583755237895794, "Standard Deviation": 1.6860156811686553, "Rank": 40 }, "Geometry": { "Average Score": 17.27716649229315, "Standard Deviation": null, "Rank": 43 }, "Algebra": { "Average Score": 23.58916877939791, "Standard Deviation": null, "Rank": 46 }, "Probability": { "Average Score": 25.1012270940144, "Standard Deviation": null, "Rank": 44 }, "Logical": { "Average Score": 29.07002036532878, "Standard Deviation": null, "Rank": 40 }, "Social": { "Average Score": 24.39006275978174, "Standard Deviation": null, "Rank": 43 }, "Chemistry": { "Average Score": 32.76096708662236, "Standard Deviation": null, "Rank": 38 }, "CPP": { "Average Score": 31.382959631870822, "Standard Deviation": null, "Rank": 35 } } }, { "config": { "model_name": "mistral-7b-instruct-1", "organization": "Mistral", "license": "Apache 2.0", "knowledge_cutoff": "2023/12" }, "results": { "OVERALL": { "Average Score": 22.167930858422395, "Standard Deviation": 3.328543828571604, "Rank": 50 }, "Geometry": { "Average Score": 11.300762460776488, "Standard Deviation": null, "Rank": 49 }, "Algebra": { "Average Score": 21.016466430115493, "Standard Deviation": null, "Rank": 48 }, "Probability": { "Average Score": 24.506863192031716, "Standard Deviation": null, "Rank": 45 }, "Logical": { "Average Score": 17.0066100312336, "Standard Deviation": null, "Rank": 49 }, "Social": { "Average Score": 14.049392081101905, "Standard Deviation": null, "Rank": 52 }, "Chemistry": { "Average Score": 20.796521445473058, "Standard Deviation": null, "Rank": 45 }, "CPP": { "Average Score": 18.929093202755805, "Standard Deviation": null, "Rank": 42 } } }, { "config": { "model_name": "vicuna-13b", "organization": "LMSYS", "license": "Non-commercial", "knowledge_cutoff": "2023/07" }, "results": { "OVERALL": { "Average Score": 20.105123059326157, "Standard Deviation": 4.100609090750239, "Rank": 51 }, "Geometry": { 
"Average Score": 13.080654946737525, "Standard Deviation": null, "Rank": 48 }, "Algebra": { "Average Score": 20.125194674408167, "Standard Deviation": null, "Rank": 49 }, "Probability": { "Average Score": 13.125942598704368, "Standard Deviation": null, "Rank": 49 }, "Logical": { "Average Score": 17.182300978389822, "Standard Deviation": null, "Rank": 48 }, "Social": { "Average Score": 16.258399348520832, "Standard Deviation": null, "Rank": 50 }, "Chemistry": { "Average Score": 23.79065696739089, "Standard Deviation": null, "Rank": 44 }, "CPP": { "Average Score": 21.840013221590294, "Standard Deviation": null, "Rank": 40 } } }, { "config": { "model_name": "zephyr-7b-beta", "organization": "HuggingFace", "license": "MIT", "knowledge_cutoff": "2023/10" }, "results": { "OVERALL": { "Average Score": 11.581258432641418, "Standard Deviation": 1.677081510212375, "Rank": 54 }, "Geometry": { "Average Score": 8.432624521698594, "Standard Deviation": null, "Rank": 50 }, "Algebra": { "Average Score": 12.912859660357217, "Standard Deviation": null, "Rank": 51 }, "Probability": { "Average Score": 7.643552619113196, "Standard Deviation": null, "Rank": 54 }, "Logical": { "Average Score": 7.444095116649809, "Standard Deviation": null, "Rank": 55 }, "Social": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 57 }, "Chemistry": { "Average Score": 16.150157007299235, "Standard Deviation": null, "Rank": 49 }, "CPP": { "Average Score": 18.92902220864132, "Standard Deviation": null, "Rank": 43 } } }, { "config": { "model_name": "gemma-1.1-2b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/02" }, "results": { "OVERALL": { "Average Score": 25.06653151900311, "Standard Deviation": 5.340973431345662, "Rank": 48 }, "Geometry": { "Average Score": 13.161686218568628, "Standard Deviation": null, "Rank": 47 }, "Algebra": { "Average Score": 15.592205919293873, "Standard Deviation": null, "Rank": 50 }, "Probability": { "Average Score": 8.305764696120711, "Standard Deviation": null, "Rank": 51 }, "Logical": { "Average Score": 10.940766703849592, "Standard Deviation": null, "Rank": 53 }, "Social": { "Average Score": 21.925546766366356, "Standard Deviation": null, "Rank": 46 }, "Chemistry": { "Average Score": 18.700936936742952, "Standard Deviation": null, "Rank": 46 }, "CPP": { "Average Score": 20.724691953843916, "Standard Deviation": null, "Rank": 41 } } }, { "config": { "model_name": "llama2-7b-chat", "organization": "Meta", "license": "Llama 2 Community", "knowledge_cutoff": "2023/07" }, "results": { "OVERALL": { "Average Score": 25.633612357313762, "Standard Deviation": 2.805639153654191, "Rank": 46 }, "Geometry": { "Average Score": 5.825877827672446, "Standard Deviation": null, "Rank": 51 }, "Algebra": { "Average Score": 8.58657284915635, "Standard Deviation": null, "Rank": 53 }, "Probability": { "Average Score": 8.164826137672431, "Standard Deviation": null, "Rank": 53 }, "Logical": { "Average Score": 20.697630462723275, "Standard Deviation": null, "Rank": 47 }, "Social": { "Average Score": 18.13821609304045, "Standard Deviation": null, "Rank": 47 }, "Chemistry": { "Average Score": 17.065363968846427, "Standard Deviation": null, "Rank": 47 }, "CPP": { "Average Score": 15.730513733660898, "Standard Deviation": null, "Rank": 45 } } }, { "config": { "model_name": "gemma-2b-it", "organization": "Google", "license": "Gemma License", "knowledge_cutoff": "2024/02" }, "results": { "OVERALL": { "Average Score": 22.935122315202772, "Standard Deviation": 1.9451357494738446, "Rank": 
49 }, "Geometry": { "Average Score": 15.523844579555126, "Standard Deviation": null, "Rank": 46 }, "Algebra": { "Average Score": 8.997563653883809, "Standard Deviation": null, "Rank": 52 }, "Probability": { "Average Score": 6.750305898269558, "Standard Deviation": null, "Rank": 55 }, "Logical": { "Average Score": 5.354222904092569, "Standard Deviation": null, "Rank": 56 }, "Social": { "Average Score": 10.938132042877358, "Standard Deviation": null, "Rank": 54 }, "Chemistry": { "Average Score": 17.06532733699507, "Standard Deviation": null, "Rank": 47 }, "CPP": { "Average Score": 17.2715657115764, "Standard Deviation": null, "Rank": 44 } } }, { "config": { "model_name": "llama2-13b-chat", "organization": "Meta", "license": "Llama 2 Community", "knowledge_cutoff": "2023/07" }, "results": { "OVERALL": { "Average Score": 25.828530292775856, "Standard Deviation": 3.2503558704879296, "Rank": 47 }, "Geometry": { "Average Score": 4.119943280135397, "Standard Deviation": null, "Rank": 53 }, "Algebra": { "Average Score": 6.355347828676415, "Standard Deviation": null, "Rank": 54 }, "Probability": { "Average Score": 11.5585998384148, "Standard Deviation": null, "Rank": 50 }, "Logical": { "Average Score": 24.172674067890938, "Standard Deviation": null, "Rank": 43 }, "Social": { "Average Score": 17.850287642446094, "Standard Deviation": null, "Rank": 49 }, "Chemistry": { "Average Score": 13.887442704655687, "Standard Deviation": null, "Rank": 52 }, "CPP": { "Average Score": 13.17258252933903, "Standard Deviation": null, "Rank": 48 } } }, { "config": { "model_name": "vicuna-7b", "organization": "LMSYS", "license": "Non-commercial", "knowledge_cutoff": "2023/07" }, "results": { "OVERALL": { "Average Score": 19.78471384913738, "Standard Deviation": 3.7936645273402276, "Rank": 52 }, "Geometry": { "Average Score": 5.434763675792798, "Standard Deviation": null, "Rank": 52 }, "Algebra": { "Average Score": 5.925959137419872, "Standard Deviation": null, "Rank": 55 }, "Probability": { "Average Score": 8.30566475354697, "Standard Deviation": null, "Rank": 51 }, "Logical": { "Average Score": 11.881223740003346, "Standard Deviation": null, "Rank": 52 }, "Social": { "Average Score": 12.864677350128595, "Standard Deviation": null, "Rank": 53 }, "Chemistry": { "Average Score": 14.187574975522333, "Standard Deviation": null, "Rank": 50 }, "CPP": { "Average Score": 14.255194156624162, "Standard Deviation": null, "Rank": 46 } } }, { "config": { "model_name": "koala-13b", "organization": "UC Berkeley", "license": "Non-commercial", "knowledge_cutoff": "2023/04" }, "results": { "OVERALL": { "Average Score": 10.216910767982592, "Standard Deviation": 2.0597606260293655, "Rank": 55 }, "Geometry": { "Average Score": 0.1600118163292883, "Standard Deviation": null, "Rank": 54 }, "Algebra": { "Average Score": 2.2219841274068948, "Standard Deviation": null, "Rank": 56 }, "Probability": { "Average Score": 3.353938470588142, "Standard Deviation": null, "Rank": 56 }, "Logical": { "Average Score": 8.24436273551765, "Standard Deviation": null, "Rank": 54 }, "Social": { "Average Score": 10.96000067573448, "Standard Deviation": null, "Rank": 54 }, "Chemistry": { "Average Score": 6.272570799004611, "Standard Deviation": null, "Rank": 53 }, "CPP": { "Average Score": 6.36433272373514, "Standard Deviation": null, "Rank": 49 } } }, { "config": { "model_name": "openassistant-pythia-12b", "organization": "OpenAssistant", "license": "Non-commercial", "knowledge_cutoff": "2023/04" }, "results": { "OVERALL": { "Average Score": 0.0, "Standard 
Deviation": 0.0, "Rank": 56 }, "Geometry": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 55 }, "Algebra": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 57 }, "Probability": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 57 }, "Logical": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 57 }, "Social": { "Average Score": 1.859688217710296, "Standard Deviation": null, "Rank": 56 }, "Chemistry": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 54 }, "CPP": { "Average Score": 0.0, "Standard Deviation": null, "Rank": 50 } } }, { "config": { "model_name": "nemotron-70b", "organization": "NVIDIA", "license": "Unknown", "knowledge_cutoff": "Unknown" }, "results": { "OVERALL": { "Average Score": 100.0, "Standard Deviation": 0.0, "Rank": 1 }, "Geometry": { "Average Score": 68.72757963233221, "Standard Deviation": null, "Rank": 12 }, "Algebra": { "Average Score": 73.71625129267943, "Standard Deviation": null, "Rank": 16 }, "Chemistry": { "Average Score": 72.48678626772566, "Standard Deviation": null, "Rank": 14 }, "Logical": { "Average Score": 92.57864400540329, "Standard Deviation": null, "Rank": 5 }, "Social": { "Average Score": 99.63342284899149, "Standard Deviation": null, "Rank": 2 }, "Probability": { "Average Score": 75.30735899300154, "Standard Deviation": null, "Rank": 10 } } }, { "config": { "model_name": "llama-3.2-3b-it", "organization": "Meta", "license": "Llama 3 Community", "knowledge_cutoff": "Unknown" }, "results": { "OVERALL": { "Average Score": 29.47099904114387, "Standard Deviation": 1.6836027650802912, "Rank": 43 }, "Geometry": { "Average Score": 0.0, "Standard Deviation": 0.0, "Rank": 50 }, "Algebra": { "Average Score": 55.31592410564261, "Standard Deviation": null, "Rank": 29 }, "Chemistry": { "Average Score": 28.667640602193643, "Standard Deviation": null, "Rank": 43 }, "Logical": { "Average Score": 15.35430947415723, "Standard Deviation": null, "Rank": 49 }, "Social": { "Average Score": 18.087938295545133, "Standard Deviation": null, "Rank": 48 }, "Probability": { "Average Score": 37.84631410688676, "Standard Deviation": null, "Rank": 35 } } }, { "config": { "model_name": "yi-lightning", "organization": "01 AI", "license": "Proprietary", "knowledge_cutoff": "Unknown" }, "results": { "OVERALL": { "Average Score": 96.10303362688546, "Standard Deviation": 0.5365246195716372, "Rank": 3 }, "Geometry": { "Average Score": 77.09570683128703, "Standard Deviation": null, "Rank": 8 }, "Algebra": { "Average Score": 85.92132293392635, "Standard Deviation": null, "Rank": 6 }, "Chemistry": { "Average Score": 95.7205664118507, "Standard Deviation": null, "Rank": 2 }, "Logical": { "Average Score": 94.60171867702756, "Standard Deviation": null, "Rank": 4 }, "Social": { "Average Score": 93.93680225135506, "Standard Deviation": null, "Rank": 6 }, "Probability": { "Average Score": 90.23858748317501, "Standard Deviation": null, "Rank": 3 } } }, { "config": { "model_name": "glm-4-plus", "organization": "Zhipu AI", "license": "Proprietary", "knowledge_cutoff": "Unknown" }, "results": { "OVERALL": { "Average Score": 90.50303579501356, "Standard Deviation": 5.202472970969946, "Rank": 6 }, "Geometry": { "Average Score": 76.37543021571776, "Standard Deviation": null, "Rank": 9 }, "Algebra": { "Average Score": 81.39859078752944, "Standard Deviation": null, "Rank": 10 }, "Chemistry": { "Average Score": 90.15506569759444, "Standard Deviation": null, "Rank": 6 }, "Logical": { "Average Score": 92.26403821208403, "Standard Deviation": null, "Rank": 
6 }, "Social": { "Average Score": 100.0, "Standard Deviation": null, "Rank": 1 }, "Probability": { "Average Score": 73.99418447190348, "Standard Deviation": null, "Rank": 11 } } } ]