Update README.md
Browse files
README.md
CHANGED
@@ -48,7 +48,9 @@ text = tokenizer.batch_decode(outputs)[0]
|
|
48 |
print(text)
|
49 |
```
|
50 |
|
|
|
51 |
|
|
|
52 |
@misc{open-llm-leaderboard,
|
53 |
author = {Edward Beeching and Clémentine Fourrier and Nathan Habib and Sheon Han and Nathan Lambert and Nazneen Rajani and Omar Sanseviero and Lewis Tunstall and Thomas Wolf},
|
54 |
title = {Open LLM Leaderboard},
|
@@ -56,6 +58,8 @@ print(text)
|
|
56 |
publisher = {Hugging Face},
|
57 |
howpublished = "\url{https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard}"
|
58 |
}
|
|
|
|
|
59 |
@software{eval-harness,
|
60 |
author = {Gao, Leo and
|
61 |
Tow, Jonathan and
|
@@ -82,6 +86,8 @@ print(text)
|
|
82 |
doi = {10.5281/zenodo.5371628},
|
83 |
url = {https://doi.org/10.5281/zenodo.5371628}
|
84 |
}
|
|
|
|
|
85 |
@misc{clark2018think,
|
86 |
title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
|
87 |
author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
|
@@ -90,6 +96,8 @@ print(text)
|
|
90 |
archivePrefix={arXiv},
|
91 |
primaryClass={cs.AI}
|
92 |
}
|
|
|
|
|
93 |
@misc{zellers2019hellaswag,
|
94 |
title={HellaSwag: Can a Machine Really Finish Your Sentence?},
|
95 |
author={Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
|
@@ -98,6 +106,8 @@ print(text)
|
|
98 |
archivePrefix={arXiv},
|
99 |
primaryClass={cs.CL}
|
100 |
}
|
|
|
|
|
101 |
@misc{hendrycks2021measuring,
|
102 |
title={Measuring Massive Multitask Language Understanding},
|
103 |
author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
|
@@ -106,6 +116,8 @@ print(text)
|
|
106 |
archivePrefix={arXiv},
|
107 |
primaryClass={cs.CY}
|
108 |
}
|
|
|
|
|
109 |
@misc{lin2022truthfulqa,
|
110 |
title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
|
111 |
author={Stephanie Lin and Jacob Hilton and Owain Evans},
|
@@ -114,6 +126,8 @@ print(text)
|
|
114 |
archivePrefix={arXiv},
|
115 |
primaryClass={cs.CL}
|
116 |
}
|
|
|
|
|
117 |
@misc{DBLP:journals/corr/abs-1907-10641,
|
118 |
title={{WINOGRANDE:} An Adversarial Winograd Schema Challenge at Scale},
|
119 |
author={Keisuke Sakaguchi and Ronan Le Bras and Chandra Bhagavatula and Yejin Choi},
|
@@ -122,6 +136,8 @@ print(text)
|
|
122 |
archivePrefix={arXiv},
|
123 |
primaryClass={cs.CL}
|
124 |
}
|
|
|
|
|
125 |
@misc{DBLP:journals/corr/abs-2110-14168,
|
126 |
title={Training Verifiers to Solve Math Word Problems},
|
127 |
author={Karl Cobbe and
|
@@ -141,4 +157,5 @@ print(text)
|
|
141 |
archivePrefix={arXiv},
|
142 |
primaryClass={cs.CL}
|
143 |
}
|
|
|
144 |
|
|
|
48 |
print(text)
|
49 |
```
|
50 |
|
51 |
+
## Citations
|
52 |
|
53 |
+
```
|
54 |
@misc{open-llm-leaderboard,
|
55 |
author = {Edward Beeching and Clémentine Fourrier and Nathan Habib and Sheon Han and Nathan Lambert and Nazneen Rajani and Omar Sanseviero and Lewis Tunstall and Thomas Wolf},
|
56 |
title = {Open LLM Leaderboard},
|
|
|
58 |
publisher = {Hugging Face},
|
59 |
howpublished = "\url{https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard}"
|
60 |
}
|
61 |
+
```
|
62 |
+
```
|
63 |
@software{eval-harness,
|
64 |
author = {Gao, Leo and
|
65 |
Tow, Jonathan and
|
|
|
86 |
doi = {10.5281/zenodo.5371628},
|
87 |
url = {https://doi.org/10.5281/zenodo.5371628}
|
88 |
}
|
89 |
+
```
|
90 |
+
```
|
91 |
@misc{clark2018think,
|
92 |
title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
|
93 |
author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
|
|
|
96 |
archivePrefix={arXiv},
|
97 |
primaryClass={cs.AI}
|
98 |
}
|
99 |
+
```
|
100 |
+
```
|
101 |
@misc{zellers2019hellaswag,
|
102 |
title={HellaSwag: Can a Machine Really Finish Your Sentence?},
|
103 |
author={Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
|
|
|
106 |
archivePrefix={arXiv},
|
107 |
primaryClass={cs.CL}
|
108 |
}
|
109 |
+
```
|
110 |
+
```
|
111 |
@misc{hendrycks2021measuring,
|
112 |
title={Measuring Massive Multitask Language Understanding},
|
113 |
author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
|
|
|
116 |
archivePrefix={arXiv},
|
117 |
primaryClass={cs.CY}
|
118 |
}
|
119 |
+
```
|
120 |
+
```
|
121 |
@misc{lin2022truthfulqa,
|
122 |
title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
|
123 |
author={Stephanie Lin and Jacob Hilton and Owain Evans},
|
|
|
126 |
archivePrefix={arXiv},
|
127 |
primaryClass={cs.CL}
|
128 |
}
|
129 |
+
```
|
130 |
+
```
|
131 |
@misc{DBLP:journals/corr/abs-1907-10641,
|
132 |
title={{WINOGRANDE:} An Adversarial Winograd Schema Challenge at Scale},
|
133 |
author={Keisuke Sakaguchi and Ronan Le Bras and Chandra Bhagavatula and Yejin Choi},
|
|
|
136 |
archivePrefix={arXiv},
|
137 |
primaryClass={cs.CL}
|
138 |
}
|
139 |
+
```
|
140 |
+
```
|
141 |
@misc{DBLP:journals/corr/abs-2110-14168,
|
142 |
title={Training Verifiers to Solve Math Word Problems},
|
143 |
author={Karl Cobbe and
|
|
|
157 |
archivePrefix={arXiv},
|
158 |
primaryClass={cs.CL}
|
159 |
}
|
160 |
+
```
|
161 |
|