Update app.py
Browse files
app.py
CHANGED
@@ -26,7 +26,7 @@ def make_leaderboard_md(elo_results):
|
|
26 |
- [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
|
27 |
- [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks.
|
28 |
|
29 |
-
💻 Code: The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). The MMLU scores are mostly computed by [InstructEval](https://github.com/declare-lab/instruct-eval). Higher values are better for all benchmarks. Empty cells mean not available. Last updated: Dec
|
30 |
"""
|
31 |
return leaderboard_md
|
32 |
|
@@ -105,7 +105,7 @@ def load_leaderboard_table_csv(filename, add_hyperlink=True):
|
|
105 |
for j in range(len(heads)):
|
106 |
item = {}
|
107 |
for h, v in zip(heads, row):
|
108 |
-
if h == "Arena Elo
|
109 |
if v != "-":
|
110 |
v = int(ast.literal_eval(v))
|
111 |
else:
|
|
|
26 |
- [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
|
27 |
- [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks.
|
28 |
|
29 |
+
💻 Code: The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). The MMLU scores are mostly computed by [InstructEval](https://github.com/declare-lab/instruct-eval). Higher values are better for all benchmarks. Empty cells mean not available. Last updated: Dec 20, 2023.
|
30 |
"""
|
31 |
return leaderboard_md
|
32 |
|
|
|
105 |
for j in range(len(heads)):
|
106 |
item = {}
|
107 |
for h, v in zip(heads, row):
|
108 |
+
if h == "Arena Elo":
|
109 |
if v != "-":
|
110 |
v = int(ast.literal_eval(v))
|
111 |
else:
|