ekurtic committed
Commit 356f10f · 1 Parent(s): 32b6206

add openllm-v2

README.md CHANGED
@@ -162,11 +162,44 @@ This version of the lm-evaluation-harness includes versions of MMLU, ARC-Challen
 <tr>
 <td><strong>Arena Hard</strong>
 </td>
-<td>85.0
+<td><strong>85.0</strong>
 </td>
-<td>84.5
+<td><strong>84.5</strong>
 </td>
-<td>99.41
+<td><strong>99.41%</strong>
+</td>
+</tr>
+<tr>
+<td><strong>OpenLLM Leaderboard v1</strong>
+</td>
+<td><strong>80.13</strong>
+</td>
+<td><strong>80.29</strong>
+</td>
+<td><strong>100.2%</strong>
+</td>
+</tr>
+<tr>
+<td><strong>OpenLLM Leaderboard v2</strong>
+</td>
+<td><strong>40.25</strong>
+</td>
+<td><strong>39.82</strong>
+</td>
+<td><strong>98.93%</strong>
+</td>
+</tr>
+</table>
+
+<table>
+<tr>
+<td><strong>Benchmark (per-task breakdown)</strong>
+</td>
+<td><strong>nvidia/Llama-3.1-Nemotron-70B-Instruct-HF</strong>
+</td>
+<td><strong>neuralmagic/Llama-3.1-Nemotron-70B-Instruct-HF-FP8-dynamic (this model)</strong>
+</td>
+<td><strong>Recovery</strong>
 </td>
 </tr>
 <tr>
@@ -243,7 +276,6 @@ This version of the lm-evaluation-harness includes versions of MMLU, ARC-Challen
 <td>102.05%
 </td>
 </tr>
-<tr>
 <td><strong>Average</strong>
 </td>
 <td><strong>80.13</strong>
@@ -252,7 +284,6 @@ This version of the lm-evaluation-harness includes versions of MMLU, ARC-Challen
 </td>
 <td><strong>100.2%</strong>
 </td>
-</tr>
 <tr>
 <td><strong>OpenLLM v2</strong>
 </td>
@@ -260,11 +291,11 @@ This version of the lm-evaluation-harness includes versions of MMLU, ARC-Challen
 <tr>
 <td>MMLU-Pro (5-shot)
 </td>
-<td>
+<td>43.45
 </td>
-<td>
+<td>42.99
 </td>
-<td>
+<td>98.94%
 </td>
 </tr>
 <tr>
@@ -280,11 +311,11 @@ This version of the lm-evaluation-harness includes versions of MMLU, ARC-Challen
 <tr>
 <td>BBH (3-shot)
 </td>
-<td>
+<td>47.12
 </td>
-<td>
+<td>46.88
 </td>
-<td>
+<td>99.5%
 </td>
 </tr>
 <tr>
@@ -297,16 +328,6 @@ This version of the lm-evaluation-harness includes versions of MMLU, ARC-Challen
 <td>91.32%
 </td>
 </tr>
-<tr>
-<td>GPQA (0-shot)
-</td>
-<td>34.05
-</td>
-<td>35.97
-</td>
-<td>105.63%
-</td>
-</tr>
 <tr>
 <td>MuSR (0-shot)
 </td>
@@ -320,11 +341,11 @@ This version of the lm-evaluation-harness includes versions of MMLU, ARC-Challen
 <tr>
 <td><strong>Average</strong>
 </td>
-<td><strong>
+<td><strong>40.25</strong>
 </td>
-<td><strong>
+<td><strong>39.82</strong>
 </td>
-<td><strong>
+<td><strong>98.93%</strong>
 </td>
 </tr>
 </table>
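
The "Recovery" column introduced in this commit is simply the FP8-dynamic model's score expressed as a percentage of the unquantized baseline's score. The following minimal sketch (illustrative only, not part of the model card or the commit) reproduces the recovery values from the numbers added in this diff:

```python
# Illustrative only: "Recovery" = FP8-dynamic score / BF16 baseline score, in percent.
# The (baseline, quantized) pairs below are copied from the tables added in this commit.
scores = {
    "Arena Hard": (85.0, 84.5),
    "OpenLLM Leaderboard v1": (80.13, 80.29),
    "OpenLLM Leaderboard v2": (40.25, 39.82),
    "MMLU-Pro (5-shot)": (43.45, 42.99),
    "BBH (3-shot)": (47.12, 46.88),
}

def recovery(baseline: float, quantized: float) -> float:
    """Quantized score as a percentage of the baseline score."""
    return 100.0 * quantized / baseline

for task, (baseline, quantized) in scores.items():
    print(f"{task}: {recovery(baseline, quantized):.2f}%")
# Prints 99.41%, 100.20%, 98.93%, 98.94%, 99.49% -- matching the diff (BBH is rounded to 99.5% in the card).
```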
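The new OpenLLM Leaderboard v2 rows (MMLU-Pro, BBH, MuSR) correspond to task groups available in recent releases of the upstream lm-evaluation-harness. As a rough sketch only: the model card references its own harness version, so the backend, task names, and defaults below are assumptions rather than the exact configuration behind these numbers.

```python
# Hypothetical reproduction sketch for the OpenLLM Leaderboard v2 rows; assumes the upstream
# lm-evaluation-harness (>= 0.4.3), not the specific fork referenced by this model card.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",  # a vLLM backend may be preferable for a 70B FP8 checkpoint
    model_args="pretrained=neuralmagic/Llama-3.1-Nemotron-70B-Instruct-HF-FP8-dynamic,dtype=auto",
    tasks=["leaderboard_mmlu_pro", "leaderboard_bbh", "leaderboard_musr"],
    batch_size="auto",
)
# Per-task metrics live under results["results"]; averaging them as in the table is left to the reader.
print(results["results"])
```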