Still in training. Trained on roughly 21 billion tokens so far.
|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|---|---|---|---|---|---|---|---|---|
|Open LLM Leaderboard|N/A| | | | | | | |
|- arc_challenge|1|none|25|acc|↑|0.2005|±|0.0117|
| | |none|25|acc_norm|↑|0.2406|±|0.0125|
|- gsm8k|3|flexible-extract|5|exact_match|↑|0.0083|±|0.0025|
| | |strict-match|5|exact_match|↑|0.0000|±|0.0000|
|- hellaswag|1|none|10|acc|↑|0.2724|±|0.0044|
| | |none|10|acc_norm|↑|0.2838|±|0.0045|
|- mmlu|2|none| |acc|↑|0.2290|±|0.0035|
|- humanities|2|none| |acc|↑|0.2380|±|0.0062|
|- formal_logic|1|none|5|acc|↑|0.2460|±|0.0385|
|- high_school_european_history|1|none|5|acc|↑|0.1818|±|0.0301|
|- high_school_us_history|1|none|5|acc|↑|0.2647|±|0.0310|
|- high_school_world_history|1|none|5|acc|↑|0.2911|±|0.0296|
|- international_law|1|none|5|acc|↑|0.2149|±|0.0375|
|- jurisprudence|1|none|5|acc|↑|0.2685|±|0.0428|
|- logical_fallacies|1|none|5|acc|↑|0.2209|±|0.0326|
|- moral_disputes|1|none|5|acc|↑|0.2457|±|0.0232|
|- moral_scenarios|1|none|5|acc|↑|0.2369|±|0.0142|
|- philosophy|1|none|5|acc|↑|0.1865|±|0.0221|
|- prehistory|1|none|5|acc|↑|0.1975|±|0.0222|
|- professional_law|1|none|5|acc|↑|0.2432|±|0.0110|
|- world_religions|1|none|5|acc|↑|0.3099|±|0.0355|
|- other|2|none| |acc|↑|0.2375|±|0.0076|
|- business_ethics|1|none|5|acc|↑|0.3200|±|0.0469|
|- clinical_knowledge|1|none|5|acc|↑|0.2226|±|0.0256|
|- college_medicine|1|none|5|acc|↑|0.1965|±|0.0303|
|- global_facts|1|none|5|acc|↑|0.1800|±|0.0386|
|- human_aging|1|none|5|acc|↑|0.3004|±|0.0308|
|- management|1|none|5|acc|↑|0.1942|±|0.0392|
|- marketing|1|none|5|acc|↑|0.2735|±|0.0292|
|- medical_genetics|1|none|5|acc|↑|0.3000|±|0.0461|
|- miscellaneous|1|none|5|acc|↑|0.2478|±|0.0154|
|- nutrition|1|none|5|acc|↑|0.2222|±|0.0238|
|- professional_accounting|1|none|5|acc|↑|0.2021|±|0.0240|
|- professional_medicine|1|none|5|acc|↑|0.1912|±|0.0239|
|- virology|1|none|5|acc|↑|0.2590|±|0.0341|
|- social sciences|2|none| |acc|↑|0.2203|±|0.0075|
|- econometrics|1|none|5|acc|↑|0.2368|±|0.0400|
|- high_school_geography|1|none|5|acc|↑|0.2020|±|0.0286|
|- high_school_government_and_politics|1|none|5|acc|↑|0.1865|±|0.0281|
|- high_school_macroeconomics|1|none|5|acc|↑|0.2205|±|0.0210|
|- high_school_microeconomics|1|none|5|acc|↑|0.2143|±|0.0267|
|- high_school_psychology|1|none|5|acc|↑|0.1908|±|0.0168|
|- human_sexuality|1|none|5|acc|↑|0.2672|±|0.0388|
|- professional_psychology|1|none|5|acc|↑|0.2386|±|0.0172|
|- public_relations|1|none|5|acc|↑|0.1727|±|0.0362|
|- security_studies|1|none|5|acc|↑|0.2367|±|0.0272|
|- sociology|1|none|5|acc|↑|0.2488|±|0.0306|
|- us_foreign_policy|1|none|5|acc|↑|0.2600|±|0.0441|
|- stem|2|none| |acc|↑|0.2157|±|0.0073|
|- abstract_algebra|1|none|5|acc|↑|0.2200|±|0.0416|
|- anatomy|1|none|5|acc|↑|0.1778|±|0.0330|
|- astronomy|1|none|5|acc|↑|0.1908|±|0.0320|
|- college_biology|1|none|5|acc|↑|0.2778|±|0.0375|
|- college_chemistry|1|none|5|acc|↑|0.2200|±|0.0416|
|- college_computer_science|1|none|5|acc|↑|0.2100|±|0.0409|
|- college_mathematics|1|none|5|acc|↑|0.2100|±|0.0409|
|- college_physics|1|none|5|acc|↑|0.2157|±|0.0409|
|- computer_security|1|none|5|acc|↑|0.2700|±|0.0446|
|- conceptual_physics|1|none|5|acc|↑|0.2638|±|0.0288|
|- electrical_engineering|1|none|5|acc|↑|0.2483|±|0.0360|
|- elementary_mathematics|1|none|5|acc|↑|0.2037|±|0.0207|
|- high_school_biology|1|none|5|acc|↑|0.1774|±|0.0217|
|- high_school_chemistry|1|none|5|acc|↑|0.2020|±|0.0282|
|- high_school_computer_science|1|none|5|acc|↑|0.2500|±|0.0435|
|- high_school_mathematics|1|none|5|acc|↑|0.2148|±|0.0250|
|- high_school_physics|1|none|5|acc|↑|0.2053|±|0.0330|
|- high_school_statistics|1|none|5|acc|↑|0.1481|±|0.0242|
|- machine_learning|1|none|5|acc|↑|0.3125|±|0.0440|
|- truthfulqa_gen|3|none|0|bleu_acc|↑|0.2362|±|0.0149|
| | |none|0|bleu_diff|↑|-1.0138|±|0.2569|
| | |none|0|bleu_max|↑|7.9522|±|0.4088|
| | |none|0|rouge1_acc|↑|0.2595|±|0.0153|
| | |none|0|rouge1_diff|↑|-1.9129|±|0.4349|
| | |none|0|rouge1_max|↑|21.7885|±|0.7307|
| | |none|0|rouge2_acc|↑|0.1200|±|0.0114|
| | |none|0|rouge2_diff|↑|-1.9771|±|0.3475|
| | |none|0|rouge2_max|↑|9.0199|±|0.5842|
| | |none|0|rougeL_acc|↑|0.2570|±|0.0153|
| | |none|0|rougeL_diff|↑|-1.8812|±|0.4185|
| | |none|0|rougeL_max|↑|19.6284|±|0.6850|
|- truthfulqa_mc1|2|none|0|acc|↑|0.1983|±|0.0140|
|- truthfulqa_mc2|2|none|0|acc|↑|0.3861|±|0.0147|
|- winogrande|1|none|5|acc|↑|0.4972|±|0.0141|
|Groups|Version|Filter|n-shot|Metric| |Value| |Stderr|
|---|---|---|---|---|---|---|---|---|
|- mmlu|2|none| |acc|↑|0.2290|±|0.0035|
|- humanities|2|none| |acc|↑|0.2380|±|0.0062|
|- other|2|none| |acc|↑|0.2375|±|0.0076|
|- social sciences|2|none| |acc|↑|0.2203|±|0.0075|
|- stem|2|none| |acc|↑|0.2157|±|0.0073|
|Tasks|Version|Filter|n-shot|Metric| |Value| |Stderr|
|---|---|---|---|---|---|---|---|---|
|agieval_nous|0|none| |acc_norm|↑|0.2133|±|0.0081|
|- agieval_aqua_rat|1|none|0|acc|↑|0.2047|±|0.0254|
| | |none|0|acc_norm|↑|0.1969|±|0.0250|
|- agieval_logiqa_en|1|none|0|acc|↑|0.2043|±|0.0158|
| | |none|0|acc_norm|↑|0.2304|±|0.0165|
|- agieval_lsat_ar|1|none|0|acc|↑|0.1739|±|0.0250|
| | |none|0|acc_norm|↑|0.1957|±|0.0262|
|- agieval_lsat_lr|1|none|0|acc|↑|0.1549|±|0.0160|
| | |none|0|acc_norm|↑|0.1608|±|0.0163|
|- agieval_lsat_rc|1|none|0|acc|↑|0.1636|±|0.0226|
| | |none|0|acc_norm|↑|0.2119|±|0.0250|
|- agieval_sat_en|1|none|0|acc|↑|0.2670|±|0.0309|
| | |none|0|acc_norm|↑|0.2621|±|0.0307|
|- agieval_sat_en_without_passage|1|none|0|acc|↑|0.2670|±|0.0309|
| | |none|0|acc_norm|↑|0.2621|±|0.0307|
|- agieval_sat_math|1|none|0|acc|↑|0.2182|±|0.0279|
| | |none|0|acc_norm|↑|0.2318|±|0.0285|
|arc_challenge|1|none|0|acc|↑|0.1945|±|0.0116|
| | |none|0|acc_norm|↑|0.2372|±|0.0124|
|truthfulqa_mc2|2|none|0|acc|↑|0.3861|±|0.0147|
|Groups|Version|Filter|n-shot|Metric| |Value| |Stderr|
|---|---|---|---|---|---|---|---|---|
|agieval_nous|0|none| |acc_norm|↑|0.2133|±|0.0081|
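The tables above follow the output format of EleutherAI's lm-evaluation-harness. Below is a minimal sketch of how a single row can be reproduced with its Python API; the checkpoint id `your-org/your-checkpoint`, the dtype, and the batch size are placeholders and assumptions, not the exact command used for these runs. Each benchmark should be evaluated with the few-shot count shown in its n-shot column.

```python
# Sketch only: evaluate one task from the table with lm-evaluation-harness
# (https://github.com/EleutherAI/lm-evaluation-harness).
# "your-org/your-checkpoint" is a placeholder; substitute the actual model repo.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",  # Hugging Face transformers backend
    model_args="pretrained=your-org/your-checkpoint,dtype=bfloat16",
    tasks=["arc_challenge"],  # one task per call keeps the n-shot setting explicit
    num_fewshot=25,           # matches the n-shot column for arc_challenge above
    batch_size=8,
)

# Per-task metrics (acc, acc_norm, and their stderr) live under results["results"].
print(results["results"]["arc_challenge"])
```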
Open LLM Leaderboard Evaluation Results
Detailed results can be found here
|Metric|Value|
|---|---|
|Avg.|4.13|
|IFEval (0-Shot)|16.39|
|BBH (3-Shot)|1.78|
|MATH Lvl 5 (4-Shot)|0.00|
|GPQA (0-shot)|0.00|
|MuSR (0-shot)|5.15|
|MMLU-PRO (5-shot)|1.47|