yuchenlin committed
Commit 1c919b3
1 Parent(s): 27b04a7

init commit

.gitignore ADDED
@@ -0,0 +1,6 @@

*.pyc
ZeroEval-main/.DS_Store
ZeroEval-main/result_dirs/.DS_Store
ZeroEval-main/result_dirs/zebra-grid/.DS_Store
.DS_Store
README.md CHANGED
@@ -1,13 +1,60 @@
  ---
- title: ZebraLogicBench Leaderboard
- emoji: 📉
  colorFrom: blue
- colorTo: red
  sdk: gradio
- sdk_version: 4.37.2
  app_file: app.py
- pinned: false
- license: apache-2.0
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

  ---
+ title: Zebra Logic Bench
+ emoji: 🦓
  colorFrom: blue
+ colorTo: yellow
  sdk: gradio
+ sdk_version: 4.19.2
  app_file: app.py
+ pinned: true
+ fullWidth: true
+ hf_oauth: true
+ api: false
+ tags:
+ - leaderboard
+ datasets:
+ - allenai/ZebraLogicBench
+ - allenai/ZebraLogicBench-private
+ models:
+ - Qwen/Qwen2-72B-Instruct
+ - Qwen/Qwen1.5-72B-Chat
+ - Qwen/Qwen1.5-7B-Chat
+ - meta-llama/Meta-Llama-3-8B-Instruct
+ - meta-llama/Meta-Llama-3-70B-Instruct
+ - meta-llama/Llama-2-13b-chat-hf
+ - meta-llama/Llama-2-70b-chat-hf
+ - meta-llama/Llama-2-7b-chat-hf
+ - mistralai/Mistral-7B-Instruct-v0.1
+ - mistralai/Mistral-7B-Instruct-v0.2
+ - mistralai/Mixtral-8x7B-Instruct-v0.1
+ - microsoft/Phi-3-medium-128k-instruct
+ - microsoft/Phi-3-mini-128k-instruct
+ - NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO
+ - NousResearch/Hermes-2-Theta-Llama-3-8B
+ - 01-ai/Yi-1.5-34B-Chat
+ - 01-ai/Yi-1.5-9B-Chat
+ - 01-ai/Yi-1.5-6B-Chat
+ - google/gemma-7b-it
+ - google/gemma-2b-it
+ - allenai/tulu-2-dpo-70b
+ - HuggingFaceH4/zephyr-7b-beta
+ - Nexusflow/Starling-LM-7B-beta
+ - databricks/dbrx-instruct
+ - princeton-nlp/Llama-3-Instruct-8B-SimPO
+ - chujiezheng/Llama-3-Instruct-8B-SimPO-ExPO
+ - chujiezheng/Starling-LM-7B-beta-ExPO
+ - ZhangShenao/SELM-Zephyr-7B-iter-3
+ - deepseek-ai/DeepSeek-V2-Chat
+ - m-a-p/neo_7b_instruct_v0.1
+ - 01-ai/Yi-34B-chat
+ - lmsys/vicuna-13b-v1.5
+ - HuggingFaceH4/zephyr-7b-gemma-v0.1
+ - deepseek-ai/DeepSeek-Coder-V2
+ - THUDM/glm-4-9b-chat
+ - chujiezheng/neo_7b_instruct_v0.1-ExPO
+ - ZhangShenao/SELM-Llama-3-8B-Instruct-iter-3
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ Paper: arxiv.org/abs/2406.04770
ZeroEval-main/result_dirs/zebra-grid.summary.json ADDED
@@ -0,0 +1,321 @@
[
  {"Model": "claude-3-5-sonnet-20240620", "Mode": "greedy", "Puzzle Acc": "33.40", "Cell Acc": "54.34", "No answer": "0.00", "Easy Puzzle Acc": "87.50", "Hard Puzzle Acc": "12.36", "Total Puzzles": 1000, "Reason Lens": "1141.94"},
  {"Model": "claude-3-5-sonnet-20240620", "Mode": "sampling", "Puzzle Acc": "33.40", "Cell Acc": "53.01", "No answer": "0.10", "Easy Puzzle Acc": "88.21", "Hard Puzzle Acc": "12.08", "Total Puzzles": 1000, "Reason Lens": "1153.83"},
  {"Model": "gpt-4o-2024-05-13", "Mode": "sampling", "Puzzle Acc": "30.80", "Cell Acc": "46.19", "No answer": "6.60", "Easy Puzzle Acc": "81.07", "Hard Puzzle Acc": "11.25", "Total Puzzles": 1000, "Reason Lens": "1549.74"},
  {"Model": "gpt-4-turbo-2024-04-09", "Mode": "greedy", "Puzzle Acc": "28.40", "Cell Acc": "47.90", "No answer": "0.10", "Easy Puzzle Acc": "80.71", "Hard Puzzle Acc": "8.06", "Total Puzzles": 1000, "Reason Lens": "1148.46"},
  {"Model": "gpt-4o-2024-05-13", "Mode": "greedy", "Puzzle Acc": "28.20", "Cell Acc": "38.72", "No answer": "19.30", "Easy Puzzle Acc": "77.86", "Hard Puzzle Acc": "8.89", "Total Puzzles": 1000, "Reason Lens": "1643.51"},
  {"Model": "gpt-4-0314", "Mode": "greedy", "Puzzle Acc": "27.10", "Cell Acc": "47.43", "No answer": "0.20", "Easy Puzzle Acc": "77.14", "Hard Puzzle Acc": "7.64", "Total Puzzles": 1000, "Reason Lens": "1203.17"},
  {"Model": "claude-3-opus-20240229", "Mode": "greedy", "Puzzle Acc": "27.00", "Cell Acc": "48.91", "No answer": "0.00", "Easy Puzzle Acc": "78.21", "Hard Puzzle Acc": "7.08", "Total Puzzles": 1000, "Reason Lens": "855.72"},
  {"Model": "gpt-4-turbo-2024-04-09", "Mode": "sampling", "Puzzle Acc": "26.40", "Cell Acc": "47.93", "No answer": "0.00", "Easy Puzzle Acc": "74.29", "Hard Puzzle Acc": "7.78", "Total Puzzles": 1000, "Reason Lens": "1165.90"},
  {"Model": "deepseek-chat", "Mode": "greedy", "Puzzle Acc": "22.70", "Cell Acc": "42.46", "No answer": "5.20", "Easy Puzzle Acc": "68.57", "Hard Puzzle Acc": "4.86", "Total Puzzles": 1000, "Reason Lens": "1260.23"},
  {"Model": "Qwen2-72B-Instruct", "Mode": "greedy", "Puzzle Acc": "21.40", "Cell Acc": "38.32", "No answer": "10.20", "Easy Puzzle Acc": "63.93", "Hard Puzzle Acc": "4.86", "Total Puzzles": 1000, "Reason Lens": "1813.82"},
  {"Model": "deepseek-coder", "Mode": "greedy", "Puzzle Acc": "21.10", "Cell Acc": "41.58", "No answer": "4.90", "Easy Puzzle Acc": "64.64", "Hard Puzzle Acc": "4.17", "Total Puzzles": 1000, "Reason Lens": "1324.55"},
  {"Model": "gemini-1.5-pro", "Mode": "sampling", "Puzzle Acc": "19.70", "Cell Acc": "45.24", "No answer": "0.40", "Easy Puzzle Acc": "60.00", "Hard Puzzle Acc": "4.03", "Total Puzzles": 1000, "Reason Lens": "1356.77"},
  {"Model": "gemini-1.5-flash", "Mode": "greedy", "Puzzle Acc": "19.40", "Cell Acc": "31.77", "No answer": "22.70", "Easy Puzzle Acc": "59.29", "Hard Puzzle Acc": "3.89", "Total Puzzles": 1000, "Reason Lens": "1538.18"},
  {"Model": "gemini-1.5-pro", "Mode": "greedy", "Puzzle Acc": "19.40", "Cell Acc": "44.59", "No answer": "0.80", "Easy Puzzle Acc": "55.71", "Hard Puzzle Acc": "5.28", "Total Puzzles": 1000, "Reason Lens": "1336.17"},
  {"Model": "yi-large-preview", "Mode": "greedy", "Puzzle Acc": "18.90", "Cell Acc": "42.61", "No answer": "1.40", "Easy Puzzle Acc": "58.93", "Hard Puzzle Acc": "3.33", "Total Puzzles": 1000, "Reason Lens": "833.36"},
  {"Model": "yi-large", "Mode": "greedy", "Puzzle Acc": "18.80", "Cell Acc": "39.83", "No answer": "1.80", "Easy Puzzle Acc": "58.21", "Hard Puzzle Acc": "3.47", "Total Puzzles": 1000, "Reason Lens": "757.01"},
  {"Model": "claude-3-sonnet-20240229", "Mode": "greedy", "Puzzle Acc": "18.70", "Cell Acc": "43.66", "No answer": "0.00", "Easy Puzzle Acc": "58.93", "Hard Puzzle Acc": "3.06", "Total Puzzles": 1000, "Reason Lens": "1095.37"},
  {"Model": "Qwen2-72B-Instruct", "Mode": "sampling", "Puzzle Acc": "18.70", "Cell Acc": "40.57", "No answer": "3.20", "Easy Puzzle Acc": "57.50", "Hard Puzzle Acc": "3.61", "Total Puzzles": 1000, "Reason Lens": "1894.72"},
  {"Model": "gemini-1.5-flash", "Mode": "sampling", "Puzzle Acc": "18.40", "Cell Acc": "36.03", "No answer": "12.80", "Easy Puzzle Acc": "57.86", "Hard Puzzle Acc": "3.06", "Total Puzzles": 1000, "Reason Lens": "1713.03"},
  {"Model": "Meta-Llama-3-70B-Instruct", "Mode": "greedy", "Puzzle Acc": "16.80", "Cell Acc": "42.31", "No answer": "0.20", "Easy Puzzle Acc": "52.86", "Hard Puzzle Acc": "2.78", "Total Puzzles": 1000, "Reason Lens": "809.95"},
  {"Model": "gemma-2-27b-it@nvidia", "Mode": "greedy", "Puzzle Acc": "16.30", "Cell Acc": "41.18", "No answer": "1.10", "Easy Puzzle Acc": "50.71", "Hard Puzzle Acc": "2.92", "Total Puzzles": 1000, "Reason Lens": "1014.56"},
  {"Model": "claude-3-haiku-20240307", "Mode": "greedy", "Puzzle Acc": "14.30", "Cell Acc": "37.87", "No answer": "0.10", "Easy Puzzle Acc": "47.86", "Hard Puzzle Acc": "1.25", "Total Puzzles": 1000, "Reason Lens": "1015.06"},
  {"Model": "reka-core-20240501", "Mode": "greedy", "Puzzle Acc": "13.00", "Cell Acc": "33.88", "No answer": "4.00", "Easy Puzzle Acc": "43.21", "Hard Puzzle Acc": "1.25", "Total Puzzles": 1000, "Reason Lens": "1078.29"},
  {"Model": "gemma-2-9b-it", "Mode": "greedy", "Puzzle Acc": "12.90", "Cell Acc": "37.07", "No answer": "0.50", "Easy Puzzle Acc": "42.14", "Hard Puzzle Acc": "1.53", "Total Puzzles": 1000, "Reason Lens": "859.14"},
  {"Model": "gemma-2-9b-it@nvidia", "Mode": "greedy", "Puzzle Acc": "12.80", "Cell Acc": "36.79", "No answer": "0.00", "Easy Puzzle Acc": "41.79", "Hard Puzzle Acc": "1.53", "Total Puzzles": 1000, "Reason Lens": "849.84"},
  {"Model": "Meta-Llama-3-8B-Instruct", "Mode": "greedy", "Puzzle Acc": "11.90", "Cell Acc": "23.70", "No answer": "29.20", "Easy Puzzle Acc": "40.71", "Hard Puzzle Acc": "0.69", "Total Puzzles": 1000, "Reason Lens": "1216.40"},
  {"Model": "gpt-3.5-turbo-0125", "Mode": "greedy", "Puzzle Acc": "10.10", "Cell Acc": "33.06", "No answer": "0.10", "Easy Puzzle Acc": "33.57", "Hard Puzzle Acc": "0.97", "Total Puzzles": 1000, "Reason Lens": "820.66"},
  {"Model": "reka-flash-20240226", "Mode": "greedy", "Puzzle Acc": "9.30", "Cell Acc": "25.67", "No answer": "18.70", "Easy Puzzle Acc": "30.71", "Hard Puzzle Acc": "0.97", "Total Puzzles": 1000, "Reason Lens": "1074.80"},
  {"Model": "Qwen2-7B-Instruct", "Mode": "greedy", "Puzzle Acc": "8.40", "Cell Acc": "22.06", "No answer": "24.40", "Easy Puzzle Acc": "29.29", "Hard Puzzle Acc": "0.28", "Total Puzzles": 1000, "Reason Lens": "1473.23"}
]
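
For readers who want to inspect this summary outside the Space, here is a minimal sketch of loading it with pandas and ranking by Puzzle Acc, mirroring what `data_load()` in app.py below does. The file path and column names come from this commit; the use of pandas is the only assumption.

```python
import json
import pandas as pd

# The same summary file that app.py loads by default.
with open("ZeroEval-main/result_dirs/zebra-grid.summary.json") as f:
    rows = json.load(f)

df = pd.DataFrame(rows)
# Metrics are stored as strings; cast them to floats before sorting,
# just as data_load() "floatifies" every value it can.
for col in ["Puzzle Acc", "Cell Acc", "Easy Puzzle Acc", "Hard Puzzle Acc"]:
    df[col] = df[col].astype(float)

print(df.sort_values("Puzzle Acc", ascending=False)[["Model", "Mode", "Puzzle Acc"]].head(10))
```
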
__init__.py ADDED
File without changes
_about_us.md ADDED
@@ -0,0 +1,16 @@
## About Us

### Team

We are from [AllenAI](https://allenai.org/) (AI2), a non-profit research organization.

[Bill Yuchen Lin](https://yuchenlin.xyz/), [Ronan Le Bras](https://rlebras.github.io/), and [Yejin Choi](https://homes.cs.washington.edu/~yejin/).


### Contact

Please contact us in the following ways:
- Github Issues/PRs for adding a new model: [https://github.com/allenai/WildBench](https://github.com/allenai/WildBench)
- HF Discussions for general questions about the leaderboard: [https://huggingface.co/spaces/allenai/WildBench/discussions](https://huggingface.co/spaces/allenai/WildBench/discussions)
- Other questions: Please contact Yuchen with email: yuchenl[at]allenai[dot]org
_header.md ADDED
@@ -0,0 +1,5 @@
<br/>

# 🦁 WildBench: Benchmarking LLMs with Challenging Tasks from Real Users in the Wild
[📑 Paper](https://allenai.github.io/WildBench/WildBench_paper.pdf) | [💻 GitHub](https://github.com/allenai/WildBench) | [🤗 HuggingFace](https://huggingface.co/collections/allenai/wildbench-65e8f2fa9c1260a85a933627) | [🐦 X](https://x.com/billyuchenlin/status/1795746137875554531) | [💬 Discussion](https://huggingface.co/spaces/allenai/WildBench/discussions) | ⚙️ **Version**: **V2** | **# Models**: {model_num} | Updated: **{LAST_UPDATED}**
_metrics.md ADDED
@@ -0,0 +1 @@
##
app.py ADDED
@@ -0,0 +1,164 @@
"""A gradio app that renders a static leaderboard. This is used for Hugging Face Space."""
import ast
import argparse
import glob
import pickle
import plotly
import gradio as gr
import numpy as np
import pandas as pd
import gradio as gr
import pandas as pd
from pathlib import Path
import json
from constants import *
from datetime import datetime, timezone
# from datasets import Dataset, load_dataset, concatenate_datasets
import os, uuid
from utils_display import model_info
from constants import column_names, LEADERBOARD_REMARKS, DEFAULT_K, LEADERBOARD_REMARKS_MAIN
import pytz
from data_utils import post_processing

# get the last updated time from the elo_ranks.all.jsonl file
LAST_UPDATED = None
# with open("_intro.md", "r") as f:
#     INTRO_MD = f.read()
INTRO_MD = ""
with open("_about_us.md", "r") as f:
    ABOUT_MD = f.read()

with open("_header.md", "r") as f:
    HEADER_MD = f.read()

with open("_metrics.md", "r") as f:
    METRICS_MD = f.read()

original_df = None
# available_models = []  # to be filled in later
available_models = list(model_info.keys())

def _tab_leaderboard():
    global original_df, available_models, gpt4t_dfs, haiku_dfs, llama_dfs, score_df
    with gr.TabItem("📊 Main", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
        default_main_df = original_df.copy()
        default_main_df.insert(0, "", range(1, 1 + len(default_main_df)))
        default_main_df_no_task = default_main_df.copy()
        # default_main_df_no_task = hide_task_column(default_main_df)
        # default_main_df_no_task = rerank(default_main_df_no_task, rank_column=WB_ELO_COLUMN)
        # default_main_df_no_task = rerank(default_main_df_no_task, rank_column=HYBRID_AVG_COLUMN)
        with gr.Row():
            # with gr.Column(scale=5):
            #     gr.Markdown(LEADERBOARD_REMARKS_MAIN, elem_classes="markdown-text-small top-left-LP")
            # with gr.Row():
            #     with gr.Column(scale=2):
            #         md = gr.Markdown(" ### 👀 More presentation options ⬇️", elem_classes="markdown-text")

            #     with gr.Column(scale=3):
            #     with gr.Column(scale=2):
            #         gr.Markdown(f"""**__🪧 Default options:__** K={DEFAULT_K}; Hybrid-Macro; for best corr w/ LMSYS Elo.""", elem_classes="markdown-text")


            # gr.Markdown(LENGTH_MARGIN_DESC_MD, elem_classes="markdown-text-tiny no_margin")
            with gr.Column(scale=5):
                with gr.Accordion("💬 Metric explanations", open=False, elem_classes="accordion-label"):
                    gr.Markdown(LEADERBOARD_REMARKS_MAIN, elem_classes="markdown-text-small no_margin")
                rank_column_radio = gr.Radio(["🆚+💯 Hybrid", "🆚 Reward-Mix (Pairwise)", "💯 Score (Individual)", "🌟 WB Elo (beta)"], show_label=False, elem_id="rank-column-radio",
                                             value="🌟 WB Elo (beta)"
                                             # value="🆚+💯 Hybrid"
                                             )
            with gr.Column(scale=2):
                with gr.Row():
                    checkbox_show_task_categorized = gr.Checkbox(label="🆚 by Task Type", elem_id="show-task-categorized", value=False)
                    show_open_source_model_only = gr.Checkbox(label="🔑 Open Models", elem_id="show-open-source-models", value=False)
            # with gr.Row():
            #     with gr.Column(scale=2):

        leaderboard_table = gr.components.Dataframe(
            value=default_main_df_no_task,
            datatype=["number", "markdown", "markdown", "number"],
            # max_rows=None,
            height=6000,
            elem_id="leaderboard-table",
            interactive=False,
            visible=True,
            column_widths=[50, 260, 120, 120, 120, 130, 100, 100, 110, 100],
            wrap=True
            # min_width=60,
        )
        # checkbox_show_task_categorized.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
        # show_open_source_model_only.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
        # rank_column_radio.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])


def _tab_submit():
    pass


def build_demo():
    global original_df, available_models, gpt4t_dfs, haiku_dfs, llama_dfs

    with gr.Blocks(theme=gr.themes.Soft(), css=css, js=js_light) as demo:
        gr.HTML(BANNER, elem_id="banner")
        # convert LAST_UPDATED to the PDT time
        LAST_UPDATED = datetime.now(pytz.timezone('US/Pacific')).strftime("%Y-%m-%d %H:%M:%S")
        # header_md_text = HEADER_MD.replace("{model_num}", str(len(original_df["-1"]))).replace("{LAST_UPDATED}", str(LAST_UPDATED))
        # gr.Markdown(header_md_text, elem_classes="markdown-text")

        with gr.Tabs(elem_classes="tab-buttons") as tabs:
            with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
                _tab_leaderboard()

            with gr.TabItem("🚀 Submit Your Results", elem_id="od-benchmark-tab-table", id=3):
                _tab_submit()

            with gr.TabItem("📮 About Us", elem_id="od-benchmark-tab-table", id=4):
                gr.Markdown(ABOUT_MD, elem_classes="markdown-text")

        with gr.Row():
            with gr.Accordion("📙 Citation", open=False, elem_classes="accordion-label"):
                gr.Textbox(
                    value=CITATION_TEXT,
                    lines=7,
                    label="Copy the BibTeX snippet to cite this source",
                    elem_id="citation-button",
                    show_copy_button=True)
                # ).style(show_copy_button=True)

    return demo


def data_load(result_file):
    global original_df
    print(f"Loading {result_file}")
    column_names_main = column_names.copy()
    # column_names_main.update({})
    main_ordered_columns = ORDERED_COLUMN_NAMES
    click_url = True
    # read json file from the result_file
    with open(result_file, "r") as f:
        data = json.load(f)
    # floatify the data, if possible
    for d in data:
        for k, v in d.items():
            try:
                d[k] = float(v)
            except:
                pass
    original_df = pd.DataFrame(data)
    original_df = post_processing(original_df, column_names_main, ordered_columns=main_ordered_columns, click_url=click_url, rank_column=RANKING_COLUMN)
    # print(original_df.columns)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--share", action="store_true")
    parser.add_argument("--result_file", help="Path to results table", default="ZeroEval-main/result_dirs/zebra-grid.summary.json")

    args = parser.parse_args()
    data_load(args.result_file)
    print(original_df)
    demo = build_demo()
    demo.launch(share=args.share, height=3000, width="100%")
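
For local debugging, the same entry point can be driven from Python instead of the CLI; a minimal sketch under the assumption that you run it from the repo root with the default summary file in place (it mirrors the `__main__` block above):

```python
from app import build_demo, data_load

# Load a summary file and serve the leaderboard locally; pass share=True for a public Gradio link.
data_load("ZeroEval-main/result_dirs/zebra-grid.summary.json")
demo = build_demo()
demo.launch(share=False, height=3000, width="100%")
```
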
constants.py ADDED
@@ -0,0 +1,283 @@
from pathlib import Path
from collections import OrderedDict

DEFAULT_K = "∞"
# DEFAULT_K = "1500"

banner_url = "https://allenai.github.io/WildBench/gray_banner.png"  # the same repo here.
BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 800px;"> </div>'

TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </b> </body> </html>"

WINRATE_HEATMAP = "<div><img src='https://github.com/WildEval/WildBench-Leaderboard/blob/main/gradio/pairwise_win_fractions.png?raw=true' style='width:100%;'></div>"

CITATION_TEXT = """@misc{lin2024wildbench,
    title={WildBench: Benchmarking LLMs with Challenging Tasks from Real Users in the Wild},
    author={Bill Yuchen Lin and Yuntian Deng and Khyathi Chandu and Faeze Brahman and Abhilasha Ravichander and Valentina Pyatkin and Nouha Dziri and Ronan Le Bras and Yejin Choi},
    year={2024},
    eprint={2406.04770},
    archivePrefix={arXiv},
    primaryClass={cs.CL},
    url={https://arxiv.org/abs/2406.04770}
}
"""

# make column_names as an ordered dict


column_names = OrderedDict({
    "Model": "Model",
    "Mode": "Mode",
    "Puzzle Acc": "Puzzle Acc",
    "Cell Acc": "Cell Acc",
    "No answer": "No answer",
    "Easy Puzzle Acc": "Easy Puzzle Acc",
    "Hard Puzzle Acc": "Hard Puzzle Acc",
    # "Total Puzzles": "Total Puzzles",
    # "Reason Lens": "Reason Lens",
})


LEADERBOARD_REMARKS = """**WB Reward**: for each comparison (A vs B), a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; when there is a **Tie**, the reward is **0**.
"""

# **WB Reward**: for each pairwise comparison, a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; 0 for a **Tie**.
# The baseline models are GPT4-Turbo, Haiku, and Llama2-70B, and Mix is the average of the three.
# **WB Score** individually scores each model based on checklists.
# Evaluator is GPT-4-Turbo.
LEADERBOARD_REMARKS_MAIN = """
"""

RANKING_COLUMN = "Puzzle Acc"

ORDERED_COLUMN_NAMES = [
    "Model",
    "Mode",
    "Puzzle Acc",
    "Easy Puzzle Acc",
    "Hard Puzzle Acc",
    "Cell Acc",
    "No answer",
]


js_light = """
function refresh() {
    const url = new URL(window.location);

    if (url.searchParams.get('__theme') !== 'light') {
        url.searchParams.set('__theme', 'light');
        window.location.href = url.href;
    }

    // Find the fieldset with the given id
    const fieldset = document.getElementById("rank-column-radio");

    // Create a new span element with the text "Rank by:"
    const rankBySpan = document.createElement("span");
    rankBySpan.textContent = "Rank by: ";
    rankBySpan.style.fontWeight = "bold"; // Optional: make the text bold
    rankBySpan.style.fontSize = "19px"; // Larger font size
    rankBySpan.style.paddingRight = "18px"; // Add padding on the right

    // Wrap the span and the labels in a flex container
    const flexContainer = document.createElement("div");
    flexContainer.style.display = "flex";
    flexContainer.style.alignItems = "center";

    // Insert the rankBySpan at the beginning of the flex container
    flexContainer.appendChild(rankBySpan);

    // Move all existing labels into the flex container
    while (fieldset.firstChild) {
        flexContainer.appendChild(fieldset.firstChild);
    }

    // Append the flex container back to the fieldset
    fieldset.appendChild(flexContainer);
}
"""

js_code = """
function scroll_top() {
    console.log("Hello from Gradio!");
    const bubbles = document.querySelectorAll('.bubble-wrap');
    bubbles.forEach((bubble, index) => {
        setTimeout(() => {
            bubble.scrollTop = 0;
        }, index * 100); // Delay of 100ms between each iteration
    });

}
"""


TASK_TYPE_STR = "**Tasks**: Info seeking (**InfoSek**), Creative Writing (**CrtWrt**), Coding&Debugging (**Code**), Reasoning (**Reason**), Editing (**Edit**), **Math**, Planning (**Plan**), Brainstorming (**Brnstrm**), Role playing (**RolPly**), Advice seeking (**AdvSek**), Data Analysis (**DataAna**)"

css = """


code {
    font-size: large;
}
footer {visibility: hidden}
.top-left-LP{
    margin-top: 6px;
    margin-left: 5px;
}
.no_margin{
    margin-top: 0px;
    margin-left: 0px;
    margin-right: 0px;
    margin-bottom: 0px;
    padding-top: 0px;
    padding-left: 0px;
    padding-right: 0px;
    padding-bottom: 0px;
}
.markdown-text{font-size: 14pt}
.markdown-text-tiny{font-size: 10pt}
.markdown-text-small{font-size: 13pt}
.markdown-text-tiny{font-size: 12pt}
.markdown-text-tiny-red{
    font-size: 12pt;
    color: red;
    background-color: yellow;
    font-color: red;
    font-weight: bold;
}
th {
    text-align: center;
    font-size: 17px; /* Adjust the font size as needed */
}
td {
    font-size: 15px; /* Adjust the font size as needed */
    text-align: center;
}

.sample_button{
    border: 2px solid #000000;
    border-radius: 10px;
    padding: 10px;
    font-size: 17pt;
    font-weight: bold;
    margin: 5px;
    background-color: #D8BFD8;
}

.chat-common{
    height: auto;
    max-height: 400px;
    min-height: 100px;
}
.chat-specific{
    height: auto;
    max-height: 600px;
    min-height: 200px;
}
#od-benchmark-tab-table-button{
    font-size: 15pt;
    font-weight: bold;
}

.btn_boderline{
    border: 1px solid #000000;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-weight: bold;
}

.btn_boderline_next{
    border: 0.1px solid #000000;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-weight: bold;
}

.btn_boderline_gray{
    border: 0.5px solid gray;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-weight: italic;
}
.btn_boderline_selected{
    border: 2px solid purple;
    background-color: #f2f2f2;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-weight: bold;
}
.accordion-label button span{
    font-size: 14pt;
    font-weight: bold;
}

#show-task-categorized span{
    font-size: 13pt;
    font-weight: bold;
}

#show-open-source-models span{
    font-size: 13pt;
    font-weight: bold;
}

#select-models span{
    font-size: 10pt;
}

#select-tasks span{
    font-size: 10pt;
}


.markdown-text-details{
    margin: 10px;
    padding: 10px;
}


button.selected[role="tab"][aria-selected="true"] {
    font-size: 18px; /* or any other size you prefer */
    font-weight: bold;
}

#od-benchmark-tab-table-ablation-button {
    font-size: larger; /* Adjust the font size as needed */
}


.plotly-plot{
    height: auto;
    max-height: 600px;
    min-height: 600px;
}

#length-margin-radio{
    font-size: 10pt;
    # padding: 0px;
    # margin: 1px;
}

#show-task-categorized{
    font-size: 12pt;
    font-decoration: bold;
}

#show-open-source-models{
    font-size: 12pt;
    font-decoration: bold;
}
"""
data_utils.py ADDED
@@ -0,0 +1,46 @@
from datasets import load_dataset, Dataset
import os
from datasets import load_dataset
from datasets.utils.logging import disable_progress_bar
from constants import column_names, RANKING_COLUMN, ORDERED_COLUMN_NAMES
from utils_display import make_clickable_model

import random
disable_progress_bar()
import math
import json
from tqdm import tqdm
import numpy as np

id_to_data = None
model_len_info = None
bench_data = None
eval_results = None
score_eval_results = None

# Formats the columns
def formatter(x):
    if type(x) is str:
        x = x
    else:
        x = round(x, 1)
    return x


def post_processing(df, column_names, rank_column=RANKING_COLUMN, ordered_columns=ORDERED_COLUMN_NAMES, click_url=True):
    for col in df.columns:
        if col == "Model" and click_url:
            df[col] = df[col].apply(lambda x: x.replace(x, make_clickable_model(x)))
        else:
            df[col] = df[col].apply(formatter)  # For numerical values
            if "Elo" in col:
                df[col] = df[col].replace('-', np.nan).astype(float)


    df.rename(columns=column_names, inplace=True)
    list_columns = [col for col in ordered_columns if col in df.columns]
    df = df[list_columns]
    if rank_column in df.columns:
        df.sort_values(by=rank_column, inplace=True, ascending=False)
    return df
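
A minimal usage sketch of `post_processing` on two rows copied from the summary above. It assumes the script runs from the repo root so that `utils_display` can find `model_info.json`; everything else (function names, column constants) comes from the files in this commit.

```python
import pandas as pd

from constants import ORDERED_COLUMN_NAMES, RANKING_COLUMN, column_names
from data_utils import post_processing

# Two rows taken from zebra-grid.summary.json, already converted to floats.
toy = pd.DataFrame([
    {"Model": "gpt-4o-2024-05-13", "Mode": "greedy", "Puzzle Acc": 28.2, "Cell Acc": 38.72,
     "No answer": 19.3, "Easy Puzzle Acc": 77.86, "Hard Puzzle Acc": 8.89},
    {"Model": "gpt-3.5-turbo-0125", "Mode": "greedy", "Puzzle Acc": 10.1, "Cell Acc": 33.06,
     "No answer": 0.1, "Easy Puzzle Acc": 33.57, "Hard Puzzle Acc": 0.97},
])

# Linkifies the Model column, rounds the metrics, reorders the columns,
# and sorts by Puzzle Acc (descending).
table = post_processing(toy, column_names, rank_column=RANKING_COLUMN,
                        ordered_columns=ORDERED_COLUMN_NAMES, click_url=True)
print(table)
```
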
init.py ADDED
File without changes
model_info.json ADDED
@@ -0,0 +1,65 @@
{
    "Qwen2-72B-Instruct": {"pretty_name": "Qwen2-72B-Instruct", "hf_model_id": "Qwen/Qwen2-72B-Instruct"},
    "Qwen2-7B-Instruct": {"pretty_name": "Qwen2-7B-Instruct", "hf_model_id": "Qwen/Qwen2-7B-Instruct"},
    "Qwen1.5-72B-Chat-greedy": {"pretty_name": "Qwen1.5-72B-Chat", "hf_model_id": "Qwen/Qwen1.5-72B-Chat"},
    "Qwen1.5-7B-Chat": {"pretty_name": "Qwen1.5-7B-Chat", "hf_model_id": "Qwen/Qwen1.5-7B-Chat"},
    "Meta-Llama-3-8B-Instruct": {"pretty_name": "Llama-3-8B-Instruct", "hf_model_id": "meta-llama/Meta-Llama-3-8B-Instruct"},
    "Meta-Llama-3-70B-Instruct": {"pretty_name": "Llama-3-70B-Instruct", "hf_model_id": "meta-llama/Meta-Llama-3-70B-Instruct"},
    "Llama-2-13b-chat-hf": {"pretty_name": "Llama-2-13B-chat", "hf_model_id": "meta-llama/Llama-2-13b-chat-hf"},
    "Llama-2-70b-chat-hf": {"pretty_name": "Llama-2-70B-chat", "hf_model_id": "meta-llama/Llama-2-70b-chat-hf"},
    "Llama-2-7b-chat-hf": {"pretty_name": "Llama-2-7B-chat", "hf_model_id": "meta-llama/Llama-2-7b-chat-hf"},
    "Mistral-7B-Instruct-v0.1": {"pretty_name": "Mistral-7B-Instruct", "hf_model_id": "mistralai/Mistral-7B-Instruct-v0.1"},
    "Mistral-7B-Instruct-v0.2": {"pretty_name": "Mistral-7B-Instruct-v0.2", "hf_model_id": "mistralai/Mistral-7B-Instruct-v0.2"},
    "Mixtral-8x7B-Instruct-v0.1": {"pretty_name": "Mixtral-8x7B-Instruct", "hf_model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1"},
    "command-r": {"pretty_name": "Command-R", "hf_model_id": "https://cohere.com/command"},
    "command-r-plus": {"pretty_name": "Command-R-Plus", "hf_model_id": "https://cohere.com/command"},
    "Phi-3-medium-128k-instruct": {"pretty_name": "Phi-3-medium-128k", "hf_model_id": "microsoft/Phi-3-medium-128k-instruct"},
    "Phi-3-mini-128k-instruct": {"pretty_name": "Phi-3-mini-128k", "hf_model_id": "microsoft/Phi-3-mini-128k-instruct"},
    "Nous-Hermes-2-Mixtral-8x7B-DPO": {"pretty_name": "Hermes-2-Mixtral-8x7B-DPO", "hf_model_id": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"},
    "Hermes-2-Theta-Llama-3-8B": {"pretty_name": "Hermes-2-Theta-Llama-3-8B", "hf_model_id": "NousResearch/Hermes-2-Theta-Llama-3-8B"},
    "yi-large": {"pretty_name": "Yi-Large", "hf_model_id": "https://platform.01.ai/"},
    "yi-large-preview": {"pretty_name": "Yi-Large-Preview", "hf_model_id": "https://platform.01.ai/"},
    "Yi-1.5-34B-Chat": {"pretty_name": "Yi-1.5-34B-Chat", "hf_model_id": "01-ai/Yi-1.5-34B-Chat"},
    "Yi-1.5-9B-Chat": {"pretty_name": "Yi-1.5-9B-Chat", "hf_model_id": "01-ai/Yi-1.5-9B-Chat"},
    "Yi-1.5-6B-Chat": {"pretty_name": "Yi-1.5-6B-Chat", "hf_model_id": "01-ai/Yi-1.5-6B-Chat"},
    "reka-flash-20240226": {"pretty_name": "Reka Flash", "hf_model_id": "https://www.reka.ai/"},
    "reka-core-20240501": {"pretty_name": "Reka Core", "hf_model_id": "https://www.reka.ai/"},
    "reka-edge": {"pretty_name": "Reka Edge", "hf_model_id": "https://www.reka.ai/"},
    "gemini-1.5-pro": {"pretty_name": "Gemini 1.5 Pro", "hf_model_id": "https://blog.google/technology/ai/google-gemini-ai/"},
    "gemini-1.5-flash": {"pretty_name": "Gemini 1.5 Flash", "hf_model_id": "https://blog.google/technology/ai/google-gemini-ai/"},
    "gemma-7b-it": {"pretty_name": "Gemma-7B-it", "hf_model_id": "google/gemma-7b-it"},
    "gemma-2b-it": {"pretty_name": "Gemma-2B-it", "hf_model_id": "google/gemma-2b-it"},
    "gpt-3.5-turbo-0125": {"pretty_name": "gpt-3.5-turbo-0125", "hf_model_id": "https://platform.openai.com/"},
    "gpt-4-0125-preview": {"pretty_name": "gpt-4-0125-preview", "hf_model_id": "https://platform.openai.com/"},
    "gpt-4o-2024-05-13": {"pretty_name": "gpt-4o-2024-05-13", "hf_model_id": "https://platform.openai.com/"},
    "gpt-4-turbo-2024-04-09": {"pretty_name": "gpt-4-turbo-2024-04-09", "hf_model_id": "https://platform.openai.com/"},
    "gpt-4-0314": {"pretty_name": "gpt-4-0314", "hf_model_id": "https://platform.openai.com/"},
    "tulu-2-dpo-70b": {"pretty_name": "Tulu-2-dpo-70b", "hf_model_id": "allenai/tulu-2-dpo-70b"},
    "zephyr-7b-beta": {"pretty_name": "Zephyr-7b-beta", "hf_model_id": "HuggingFaceH4/zephyr-7b-beta"},
    "mistral-large-2402": {"pretty_name": "Mistral-Large", "hf_model_id": "https://mistral.ai/news/mistral-large/"},
    "claude-3-haiku-20240307": {"pretty_name": "Claude 3 Haiku", "hf_model_id": "https://www.anthropic.com/claude"},
    "claude-3-opus-20240229": {"pretty_name": "Claude 3 Opus", "hf_model_id": "https://www.anthropic.com/claude"},
    "claude-3-sonnet-20240229": {"pretty_name": "Claude 3 Sonnet", "hf_model_id": "https://www.anthropic.com/claude"},
    "claude-3-5-sonnet-20240620": {"pretty_name": "Claude 3.5 Sonnet", "hf_model_id": "https://www.anthropic.com/claude"},
    "Starling-LM-7B-beta": {"pretty_name": "StarlingLM-7B-beta", "hf_model_id": "Nexusflow/Starling-LM-7B-beta"},
    "dbrx-instruct": {"pretty_name": "DBRX Instruct", "hf_model_id": "databricks/dbrx-instruct"},
    "Llama-3-Instruct-8B-SimPO": {"pretty_name": "Llama3-Inst-8B-SimPO", "hf_model_id": "princeton-nlp/Llama-3-Instruct-8B-SimPO"},
    "Llama-3-Instruct-8B-SimPO-ExPO": {"pretty_name": "Llama3-Inst-8B-SimPO-ExPO", "hf_model_id": "chujiezheng/Llama-3-Instruct-8B-SimPO-ExPO"},
    "Starling-LM-7B-beta-ExPO": {"pretty_name": "Starling-LM-7B-beta-ExPO", "hf_model_id": "chujiezheng/Starling-LM-7B-beta-ExPO"},
    "SELM-Zephyr-7B-iter-3": {"pretty_name": "SELM (Zephyr-7B-iter3)", "hf_model_id": "ZhangShenao/SELM-Zephyr-7B-iter-3"},
    "deepseekv2-chat": {"pretty_name": "DeepSeek-V2-Chat", "hf_model_id": "deepseek-ai/DeepSeek-V2-Chat"},
    "deepseek-coder-v2": {"pretty_name": "DeepSeek-Coder-V2-Inst", "hf_model_id": "deepseek-ai/DeepSeek-Coder-V2-Instruct"},
    "deepseek-chat": {"pretty_name": "DeepSeek-V2-Chat", "hf_model_id": "https://platform.deepseek.com/api-docs/api/deepseek-api/", "open": true},
    "deepseek-coder": {"pretty_name": "DeepSeek-Coder-V2", "hf_model_id": "https://platform.deepseek.com/api-docs/api/deepseek-api/", "open": true},
    "gemma-2-27b-it@nvidia": {"pretty_name": "Gemma-2-27B-it", "hf_model_id": "https://huggingface.co/google/gemma-2-27b-it"},
    "gemma-2-9b-it@nvidia": {"pretty_name": "Gemma-2-9B-it", "hf_model_id": "https://huggingface.co/google/gemma-2-9b-it"},
    "neo_7b_instruct_v0.1": {"pretty_name": "Neo-7B-Instruct", "hf_model_id": "m-a-p/neo_7b_instruct_v0.1"},
    "Yi-34B-Chat": {"pretty_name": "Yi-34B-Chat", "hf_model_id": "01-ai/Yi-34B-chat"},
    "vicuna-13b-v1.5": {"pretty_name": "Vicuna-13b-v1.5", "hf_model_id": "lmsys/vicuna-13b-v1.5"},
    "zephyr-7b-gemma-v0.1": {"pretty_name": "Zephyr-7b-Gemma", "hf_model_id": "HuggingFaceH4/zephyr-7b-gemma-v0.1"},
    "glm-4-9b-chat": {"pretty_name": "GLM-4-9B-Chat", "hf_model_id": "THUDM/glm-4-9b-chat"},
    "neo_7b_instruct_v0.1-ExPO": {"pretty_name": "Neo-7B-Instruct-ExPO", "hf_model_id": "chujiezheng/neo_7b_instruct_v0.1-ExPO"},
    "SELM-Llama-3-8B-Instruct-iter-3": {"pretty_name": "SELM (Llama3-8B-Inst-iter3)", "hf_model_id": "ZhangShenao/SELM-Llama-3-8B-Instruct-iter-3"},
    "nemotron-4-340b-instruct": {"pretty_name": "Nemotron-4-340B-Instruct", "hf_model_id": "nvidia/Nemotron-4-340B-Instruct"},
    "Llama-3-8B-Magpie-Align-v0.1": {"pretty_name": "Magpie-8B-Align-v0.1", "hf_model_id": "Magpie-Align/Llama-3-8B-Magpie-Align-v0.1"}
}
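
The keys in this file are the raw model names used in the result files. A small sketch of how an entry resolves to a display name and link target; the file path assumes the repo root, and both lookups use keys present above.

```python
import json

with open("model_info.json") as f:
    model_info = json.load(f)

# hf_model_id is either a Hugging Face repo id or, for API-only models, a full provider URL.
entry = model_info["Meta-Llama-3-70B-Instruct"]
print(entry["pretty_name"], "->", "https://huggingface.co/" + entry["hf_model_id"])

entry = model_info["claude-3-5-sonnet-20240620"]
print(entry["pretty_name"], "->", entry["hf_model_id"])
```
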
requirements.txt ADDED
@@ -0,0 +1,4 @@
gradio[oauth]==4.19.2
datasets
toolz==0.12.1
plotly
style.css ADDED
@@ -0,0 +1,27 @@
body {
    font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
}

h1 {
    font-size: 16px;
    margin-top: 0;
}

p {
    color: rgb(107, 114, 128);
    font-size: 15px;
    margin-bottom: 10px;
    margin-top: 5px;
}

.card {
    max-width: 620px;
    margin: 0 auto;
    padding: 16px;
    border: 1px solid lightgray;
    border-radius: 16px;
}

.card p:last-child {
    margin-bottom: 0;
}
themes.py ADDED
@@ -0,0 +1,45 @@
from __future__ import annotations
from typing import Iterable
import gradio as gr
from gradio.themes.base import Base
from gradio.themes.utils import colors, fonts, sizes
import time

class Seafoam(Base):
    def __init__(
        self,
        *,
        primary_hue: colors.Color | str = colors.blue,
        secondary_hue: colors.Color | str = colors.gray,
        neutral_hue: colors.Color | str = colors.gray,
        spacing_size: sizes.Size | str = sizes.spacing_md,
        radius_size: sizes.Size | str = sizes.radius_md,
        text_size: sizes.Size | str = sizes.text_lg,
        font: fonts.Font
        | str
        | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("Quicksand"),
            "ui-sans-serif",
            "sans-serif",
        ),
        font_mono: fonts.Font
        | str
        | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("IBM Plex Mono"),
            "ui-monospace",
            "monospace",
        ),
    ):
        super().__init__(
            primary_hue=primary_hue,
            secondary_hue=secondary_hue,
            neutral_hue=neutral_hue,
            spacing_size=spacing_size,
            radius_size=radius_size,
            text_size=text_size,
            font=font,
            font_mono=font_mono,
        )


seafoam = Seafoam()
update_data.sh ADDED
@@ -0,0 +1,40 @@
TARGET_DIR="ZeroEval-main"

rm -r $TARGET_DIR
# Download the ZIP file
curl -L -o zeroeval.zip https://github.com/yuchenlin/ZeroEval/archive/refs/heads/main.zip
unzip zeroeval.zip
rm zeroeval.zip

#!/bin/bash

# Define the target directory and the exception folder
EXCEPTION_FOLDER="result_dirs"

# Ensure the target directory exists
if [ -d "$TARGET_DIR" ]; then
    # Loop through each item in the target directory
    for item in "$TARGET_DIR"/*; do
        # Check if it is not the exception folder
        if [ "$(basename "$item")" != "$EXCEPTION_FOLDER" ]; then
            # Remove the item (file or directory)
            rm -rf "$item"
            echo "Removed: $item"
        fi
    done
else
    echo "Target directory does not exist: $TARGET_DIR"
fi

# only keep the result_dirs/zebra-grid under result_dirs folder; remove all other sub-folders under result_dirs
# Remove all subdirectories in result_dirs except zebra-grid
find "$TARGET_DIR/result_dirs" -maxdepth 1 -type d ! -name 'zebra-grid' ! -name 'result_dirs' -exec rm -rf {} +

rm -rf $TARGET_DIR/.github
rm -rf $TARGET_DIR/.gitignore


# tables


# bash update_table.sh
update_table.sh ADDED
File without changes
utils_display.py ADDED
@@ -0,0 +1,34 @@
import json

with open("model_info.json", "r") as f:
    model_info = json.load(f)

def make_clickable_model(model_name):
    global model_info
    modified_model_name = model_name
    if model_name in model_info:
        if model_info[model_name]["hf_model_id"].startswith("http"):
            link = model_info[model_name]["hf_model_id"]
            modified_model_name = f'🔒 <a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_info[model_name]["pretty_name"]}</a>'
        else:
            link = f"https://huggingface.co/{model_info[model_name]['hf_model_id']}"
            modified_model_name = f'🔑 <a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_info[model_name]["pretty_name"]}</a>'
    if "Neo-7B" in modified_model_name:
        # models that are fully open source
        modified_model_name = modified_model_name.replace("🔑", "💎🔑")

    if "🚨</a>" in modified_model_name:
        modified_model_name = modified_model_name.replace(' 🚨</a>', '</a> 🚨')
    # if model_name in ["gpt-4-turbo-2024-04-09", "Llama-2-70b-chat-hf", "claude-3-haiku-20240307"]:
    #     modified_model_name = modified_model_name.replace('style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"', 'style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted; font-weight: bold; background-color: var(--link-background-color);"')
    return modified_model_name


def styled_error(error):
    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"

def styled_warning(warn):
    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"

def styled_message(message):
    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
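
A quick check of `make_clickable_model`, which is what `post_processing` applies to the Model column. Both names below are keys in model_info.json; open-weight models get the 🔑 prefix with a Hugging Face link, API-only models get 🔒 with the provider URL. A minimal sketch, assuming it is run from the repo root:

```python
from utils_display import make_clickable_model

print(make_clickable_model("Meta-Llama-3-70B-Instruct"))   # 🔑 + huggingface.co link
print(make_clickable_model("claude-3-5-sonnet-20240620"))  # 🔒 + anthropic.com link
```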