陈俊杰
committed
Commit • c3f31ee
1 Parent(s): 9a263a1
fontSize
app.py
CHANGED
```diff
@@ -126,6 +126,7 @@ st.markdown("""
     .main-text {
         font-size: 18px;
         line-height: 1.6;
+        color: #4CAF50;
     }
     </style>
 """, unsafe_allow_html=True)
```
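For context, the rule above lives in a `<style>` block that the app pushes through `st.markdown`, so later HTML snippets can opt into the `.main-text` class. A minimal runnable sketch of that pattern (only the `.main-text` rule comes from this commit; the example paragraph is invented):

```python
import streamlit as st

# Inject a global stylesheet once, near the top of the app.
# The .main-text rule matches this commit; the rest is a sketch.
st.markdown("""
    <style>
    .main-text {
        font-size: 18px;
        line-height: 1.6;
        color: #4CAF50;
    }
    </style>
""", unsafe_allow_html=True)

# Later HTML rendered with unsafe_allow_html=True can use the class:
st.markdown("<p class='main-text'>Example paragraph.</p>", unsafe_allow_html=True)
```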
```diff
@@ -142,8 +143,8 @@ elif page == "Methodology":
     st.image("asserts/method.svg", use_column_width=True)
     st.markdown("""
     <ol class='main-text'>
-        <li>First, we choose four subtasks as shown in the table below:</li>
-        <table>
+        <li class='main-text'>First, we choose four subtasks as shown in the table below:</li>
+        <table class='main-text'>
     <thead>
     <tr>
         <th style="text-align: left">Task</th>
```
```diff
@@ -174,9 +175,9 @@ elif page == "Methodology":
     </tr>
     </tbody>
     </table>
-        <li>Second, we choose a series of popular LLMs during the competition to generate answers.</li>
-        <li>Third, we manually annotate the answer sets for each question, which will be used as gold standards for evaluating the performance of different evaluation methods.</li>
-        <li>Last, we will collect evaluation results from participants and calculate consistency with manually annotated results. We will use Accuracy, Kendall’s tau and Spearman correlation coefficient as the evaluation metrics.</li>
+        <li class='main-text'>Second, we choose a series of popular LLMs during the competition to generate answers.</li>
+        <li class='main-text'>Third, we manually annotate the answer sets for each question, which will be used as gold standards for evaluating the performance of different evaluation methods.</li>
+        <li class='main-text'>Last, we will collect evaluation results from participants and calculate consistency with manually annotated results. We will use Accuracy, Kendall’s tau and Spearman correlation coefficient as the evaluation metrics.</li>
     </ol>
     """,unsafe_allow_html=True)
 
```
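The final step above scores each submitted evaluation method by its consistency with human annotations. A minimal sketch of the pairwise-preference conversion that the Accuracy metric implies (the function names, higher-is-better scoring, and tie handling are assumptions here, not taken from the task's code):

```python
from itertools import combinations

def to_pairwise_preferences(scores):
    """Turn per-answer scores into pairwise preferences.

    Maps each index pair (i, j) to 1 if answer i is preferred,
    -1 if answer j is preferred, and 0 for a tie.
    """
    prefs = {}
    for i, j in combinations(range(len(scores)), 2):
        if scores[i] > scores[j]:
            prefs[(i, j)] = 1
        elif scores[i] < scores[j]:
            prefs[(i, j)] = -1
        else:
            prefs[(i, j)] = 0  # tie handling here is an assumption
    return prefs

def pairwise_accuracy(method_scores, human_scores):
    """Fraction of answer pairs on which the method agrees with humans."""
    m = to_pairwise_preferences(method_scores)
    h = to_pairwise_preferences(human_scores)
    return sum(m[p] == h[p] for p in m) / len(m)

# Invented scores for four answers to one question: an automatic judge
# versus the human gold annotation. 5 of the 6 pairs agree.
print(pairwise_accuracy([4, 2, 3, 1], [4, 3, 2, 1]))  # 0.8333...
```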
```diff
@@ -196,39 +197,31 @@ elif page == "Datasets":
 elif page == "Important Dates":
     st.header("Important Dates")
     st.markdown("""
-    <p class='main-text'><em>All deadlines are at 11:59pm in the Anywhere on Earth (AOE) timezone.</em><br />
-    <span class=
-    <span class=
-    <span class=
-    <span class=
-    <span class=
-    <span class=
-    <span class=
-    <span class=
+    <p class='main-text'><em class='main-text>All deadlines are at 11:59pm in the Anywhere on Earth (AOE) timezone.</em><br />
+    <span class='main-text'><strong>Kickoff Event</strong>:</span> <span class='main-text'>March 29, 2024</span><br />
+    <span class='main-text'><strong>Dataset Release</strong>:</span> <span class='main-text'>👉May 1, 2024</span><br />
+    <span class='main-text'><strong>System Output Submission Deadline</strong>:</span> <span class='main-text'>Jan 15, 2025</span><br />
+    <span class='main-text'><strong>Evaluation Results Release</strong>:</span> <span class='main-text'>Feb 1, 2025</span> <br />
+    <span class='main-text'><strong>Task overview release (draft)</strong>:</span> <span class='main-text'>Feb 1, 2025</span><br />
+    <span class='main-text'><strong>Submission Due of Participant Papers (draft)</strong>:</span> <span class='main-text'>March 1, 2025</span><br />
+    <span class='main-text'><strong>Camera-Ready Participant Paper Due</strong>:</span> <span class='main-text'>May 1, 2025</span><br />
+    <span class='main-text'><strong>NTCIR-18 Conference</strong>:</span> <span class='main-text'>Jun 10-13 2025</span><br /></p>
     """,unsafe_allow_html=True)
 elif page == "Evaluation Measures":
     st.header("Evaluation Measures")
     st.markdown("""
-    <
-
-
-
-    $$
-    \\tau=\\frac{C-D}{\\frac{1}{2}n(n-1)}
-    $$
-
-    where:
-    - C is the number of concordant pairs,
-    - D is the number of discordant pairs,
-    - n is the number of pairs.
-    - **Spearman's Rank Correlation Coefficient:** Measures the strength and direction of the association between two ranked variables.
-    $$
-    \\rho = 1 - \\frac{6 \sum d_i^2}{n(n^2 - 1)}
-    $$
+    <ul class='main-text'>
+    <li><strong>Acc(Accuracy): </strong>The proportion of identical preference results between the model and human annotations. Specifically, we first convert individual scores (ranks) into pairwise preferences and then calculate consistency with human annotations.</li>
+    <li><strong>Kendall's tau: </strong>Measures the ordinal association between two ranked variables. $$\tau = \frac{C-D}{\frac{1}{2}n(n-1)}$$
     where:
-
-
-
+    C is the number of concordant pairs,
+    D is the number of discordant pairs,
+    n is the number of pairs.</li>
+    <li><strong>Spearman's Rank Correlation Coefficient: </strong>Measures the strength and direction of the association between two ranked variables. $$\rho = 1 - \frac{6 \sum d_i^2}{n(n^2 - 1)}$$
+    where:
+    \(d_i\) is the difference between the ranks of corresponding elements in the two lists,
+    n is the number of elements.</li>
+    </ul>
 """,unsafe_allow_html=True)
 elif page == "Data and File format":
     st.header("Data and File format")
```
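Both formulas in this hunk can be sanity-checked numerically; `scipy.stats` ships implementations of both coefficients. The rankings below are invented for illustration:

```python
from itertools import combinations

from scipy.stats import kendalltau, spearmanr

# Invented rankings of five answers: automatic judge vs. human gold.
method = [1, 2, 3, 4, 5]
human = [2, 1, 3, 5, 4]
n = len(method)

# Kendall's tau per the formula in the diff: (C - D) / (n(n-1)/2).
C = sum((method[i] - method[j]) * (human[i] - human[j]) > 0
        for i, j in combinations(range(n), 2))
D = n * (n - 1) // 2 - C  # no ties in this example
print((C - D) / (n * (n - 1) / 2))  # 0.6

# Spearman's rho per the formula: 1 - 6 * sum(d_i^2) / (n(n^2 - 1)).
d2 = sum((m - h) ** 2 for m, h in zip(method, human))
print(1 - 6 * d2 / (n * (n ** 2 - 1)))  # 0.8

# scipy agrees; both functions return (statistic, p-value) pairs.
tau, _ = kendalltau(method, human)
rho, _ = spearmanr(method, human)
print(tau, rho)  # 0.6 0.8 (up to floating point)
```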
```diff
@@ -254,10 +247,12 @@ elif page == "LeaderBoard":
     st.markdown("""
     <div class='main-text'>
 This leaderboard is used to show the performance of the **automatic evaluation methods of LLMs** submitted by the **AEOLLM team** on four tasks:
-    - Dialogue Generation (DG)
-    - Text Expansion (TE)
-    - Summary Generation (SG)
-    - Non-Factoid QA (NFQA)
+    <ul class='main-text'>
+    <li>Dialogue Generation (DG)</li>
+    <li>Text Expansion (TE)</li>
+    <li>Summary Generation (SG)</li>
+    <li>Non-Factoid QA (NFQA)</li>
+    </ul>
     </div>
     """, unsafe_allow_html=True)
 # 创建示例数据 (create sample data)
```
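A plausible reading of this hunk: Markdown bullets (`- item`) are not re-parsed once they sit inside a raw HTML `<div>` passed to `st.markdown`, so the list is rewritten as an HTML `<ul>`. A sketch of the working pattern (the text is abbreviated from the diff):

```python
import streamlit as st

# Markdown syntax such as "- item" is not re-processed inside a raw
# HTML block, so the bullets are written as an HTML list instead.
st.markdown("""
<div class='main-text'>
This leaderboard covers four tasks:
<ul class='main-text'>
<li>Dialogue Generation (DG)</li>
<li>Text Expansion (TE)</li>
<li>Summary Generation (SG)</li>
<li>Non-Factoid QA (NFQA)</li>
</ul>
</div>
""", unsafe_allow_html=True)
```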
```diff
@@ -309,19 +304,19 @@ This leaderboard is used to show the performance of the **automatic evaluation m
 tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])
 
 with tab1:
-    st.markdown("""Task: Dialogue Generation; Dataset: DialyDialog""", unsafe_allow_html=True)
+    st.markdown("""<div class='main-text'>Task: Dialogue Generation; Dataset: DialyDialog</div>""", unsafe_allow_html=True)
     st.dataframe(df1, use_container_width=True)
 
 with tab2:
-    st.markdown("""Task: Text Expansion; Dataset: WritingPrompts""", unsafe_allow_html=True)
+    st.markdown("""<div class='main-text'>Task: Text Expansion; Dataset: WritingPrompts</div>""", unsafe_allow_html=True)
     st.dataframe(df2, use_container_width=True)
 
 with tab3:
-    st.markdown("""Task: Summary Generation; Dataset: Xsum""", unsafe_allow_html=True)
+    st.markdown("""<div class='main-text'>Task: Summary Generation; Dataset: Xsum</div>""", unsafe_allow_html=True)
     st.dataframe(df3, use_container_width=True)
 
 with tab4:
-    st.markdown("""Task: Non-Factoid QA; Dataset: NF_CATS""", unsafe_allow_html=True)
+    st.markdown("""<div class='main-text'>Task: Non-Factoid QA; Dataset: NF_CATS</div>""", unsafe_allow_html=True)
     st.dataframe(df4, use_container_width=True)
 elif page == "Organisers":
     st.header("Organisers")
```
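The tab pattern in this hunk, sketched end to end with stand-in data (`df1` and its columns are invented; only the tab labels and the call pattern come from the diff):

```python
import pandas as pd
import streamlit as st

# Stand-in leaderboard; the real app builds df1..df4 from submission results.
df1 = pd.DataFrame({
    "Team": ["baseline-A", "baseline-B"],  # invented rows
    "Accuracy": [0.61, 0.58],
    "Kendall's tau": [0.42, 0.37],
    "Spearman": [0.45, 0.40],
})

tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])

with tab1:
    # "DialyDialog" is spelled as in the app's source.
    st.markdown("<div class='main-text'>Task: Dialogue Generation; Dataset: DialyDialog</div>",
                unsafe_allow_html=True)
    st.dataframe(df1, use_container_width=True)
```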