import streamlit as st
import pandas as pd
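
# AEOLLM leaderboard app: renders baseline automatic-evaluation results for
# four tasks as tabbed tables. To launch it locally (assuming this file is
# saved as app.py; the filename is illustrative): streamlit run app.py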

# Inject CSS: enlarge the page title and widen the leaderboard tables.
st.markdown("""
<style>
h1 {
    font-size: 2.5em; /* title font size */
}
.stDataFrame {
    font-family: Helvetica;
}
.dataframe th, .dataframe td {
    width: auto;
    min-width: 500px;
}
</style>
""", unsafe_allow_html=True)

st.title('🏆AEOLLM Leaderboard')

st.markdown("""
This leaderboard shows the performance of the **automatic evaluation methods of LLMs** submitted by the **AEOLLM team** on four tasks:
- Dialogue Generation (DG)
- Text Expansion (TE)
- Summary Generation (SG)
- Non-Factoid QA (NFQA)

Details of AEOLLM can be found at the link: [https://aeollm.github.io/](https://aeollm.github.io/)
""", unsafe_allow_html=True)

# Per-task baseline results: accuracy plus two rank-correlation metrics
# (Kendall's tau and Spearman) for each evaluation method.
DG = {
    "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
    "team": ["baseline", "baseline", "baseline", "baseline"],
    "accuracy": [0.5806, 0.5483, 0.6001, 0.6472],
    "kendall's tau": [0.3243, 0.1739, 0.3042, 0.4167],
    "spearman": [0.3505, 0.1857, 0.3264, 0.4512],
}
df1 = pd.DataFrame(DG)

TE = {
    "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
    "team": ["baseline", "baseline", "baseline", "baseline"],
    "accuracy": [0.5107, 0.5050, 0.5461, 0.5581],
    "kendall's tau": [0.1281, 0.0635, 0.2716, 0.3864],
    "spearman": [0.1352, 0.0667, 0.2867, 0.4157],
}
df2 = pd.DataFrame(TE)

SG = {
    "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
    "team": ["baseline", "baseline", "baseline", "baseline"],
    "accuracy": [0.6504, 0.6014, 0.7162, 0.7441],
    "kendall's tau": [0.3957, 0.2688, 0.5092, 0.5001],
    "spearman": [0.4188, 0.2817, 0.5403, 0.5405],
}
df3 = pd.DataFrame(SG)

NFQA = {
    "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
    "team": ["baseline", "baseline", "baseline", "baseline"],
    "accuracy": [0.5935, 0.5817, 0.7000, 0.7203],
    "kendall's tau": [0.2332, 0.2389, 0.4440, 0.4235],
    "spearman": [0.2443, 0.2492, 0.4630, 0.4511],
}
df4 = pd.DataFrame(NFQA)
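
# The four tables share the same "methods" and "team" columns; a minimal
# sketch of a builder that would remove the repetition (an optional refactor,
# not used below; make_df, METHODS, and TEAM are names invented here):
#
# METHODS = ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"]
# TEAM = ["baseline"] * len(METHODS)
#
# def make_df(accuracy, tau, spearman):
#     return pd.DataFrame({
#         "methods": METHODS,
#         "team": TEAM,
#         "accuracy": accuracy,
#         "kendall's tau": tau,
#         "spearman": spearman,
#     })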

# One tab per task.
tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])

with tab1:
    st.header("Task: Dialogue Generation; Dataset: DailyDialog")
    st.dataframe(df1, use_container_width=True)

with tab2:
    st.header("Task: Text Expansion; Dataset: WritingPrompts")
    st.dataframe(df2, use_container_width=True)

with tab3:
    st.header("Task: Summary Generation; Dataset: XSum")
    st.dataframe(df3, use_container_width=True)

with tab4:
    st.header("Task: Non-Factoid QA; Dataset: NF_CATS")
    st.dataframe(df4, use_container_width=True)
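
# An equivalent loop-based rendering (a sketch only, not used above; the
# `pages` name is invented here):
#
# pages = {
#     "DG": ("Task: Dialogue Generation; Dataset: DailyDialog", df1),
#     "TE": ("Task: Text Expansion; Dataset: WritingPrompts", df2),
#     "SG": ("Task: Summary Generation; Dataset: XSum", df3),
#     "NFQA": ("Task: Non-Factoid QA; Dataset: NF_CATS", df4),
# }
# for tab, (header, df) in zip(st.tabs(list(pages)), pages.values()):
#     with tab:
#         st.header(header)
#         st.dataframe(df, use_container_width=True)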