AEOLLM / app.py
陈俊杰
cjj: updateRank
eaa1d85
raw
history blame
2.75 kB
import streamlit as st
import pandas as pd
# Custom CSS for the page: title size, dataframe font, and table-cell sizing.
# The stylesheet is runtime text, so it is kept verbatim (including its
# embedded CSS comment, which means "title font size").
_PAGE_STYLE = """
<style>
h1 {
font-size: 2.5em; /* 标题字体大小 */
}
.stDataFrame {
font-family: Helvetica;
}
.dataframe th, .dataframe td {
width: auto;
min-width: 500px;
}
</style>
"""
# unsafe_allow_html lets the <style> element through instead of being escaped.
st.markdown(_PAGE_STYLE, unsafe_allow_html=True)
# Page title.
st.title('🏆AEOLLM Leaderboard')
# Introductory description shown under the title.
# The blank line after the bullet list is intentional: without it Markdown
# treats the "Details of ..." sentence as a continuation of the last bullet
# instead of rendering it as its own paragraph.
st.markdown("""
This leaderboard is used to show the performance of the **automatic evaluation methods of LLMs** submitted by the **AEOLLM team** on four tasks:
- Dialogue Generation (DG)
- Text Expansion (TE)
- Summary Generation (SG)
- Non-Factoid QA (NFQA)

Details of AEOLLM can be found at the link: [https://aeollm.github.io/](https://aeollm.github.io/)
""", unsafe_allow_html=True)
# Build the sample leaderboard data: one dict and one DataFrame per task.
# Every table lists the same four baseline methods in the same order, so the
# shared columns are generated by a helper and only the scores are spelled out.
_BASELINE_METHODS = ("chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini")


def _baseline_table(accuracy, kendall_tau, spearman):
    """Return one task's column dict for the baseline methods.

    Each argument is a list of scores ordered like ``_BASELINE_METHODS``.
    Fresh "methods" and "team" lists are created on every call so the
    resulting dicts never share mutable state.
    """
    return {
        "methods": list(_BASELINE_METHODS),
        "team": ["baseline" for _ in _BASELINE_METHODS],
        "accuracy": accuracy,
        "kendall's tau": kendall_tau,
        "spearman": spearman,
    }


# Dialogue Generation
DG = _baseline_table(
    accuracy=[0.5806, 0.5483, 0.6001, 0.6472],
    kendall_tau=[0.3243, 0.1739, 0.3042, 0.4167],
    spearman=[0.3505, 0.1857, 0.3264, 0.4512],
)
df1 = pd.DataFrame(DG)

# Text Expansion
TE = _baseline_table(
    accuracy=[0.5107, 0.5050, 0.5461, 0.5581],
    kendall_tau=[0.1281, 0.0635, 0.2716, 0.3864],
    spearman=[0.1352, 0.0667, 0.2867, 0.4157],
)
df2 = pd.DataFrame(TE)

# Summary Generation
SG = _baseline_table(
    accuracy=[0.6504, 0.6014, 0.7162, 0.7441],
    kendall_tau=[0.3957, 0.2688, 0.5092, 0.5001],
    spearman=[0.4188, 0.2817, 0.5403, 0.5405],
)
df3 = pd.DataFrame(SG)

# Non-Factoid QA
NFQA = _baseline_table(
    accuracy=[0.5935, 0.5817, 0.7000, 0.7203],
    kendall_tau=[0.2332, 0.2389, 0.4440, 0.4235],
    spearman=[0.2443, 0.2492, 0.4630, 0.4511],
)
df4 = pd.DataFrame(NFQA)
# Create one tab per task, in the order DG, TE, SG, NFQA.
tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])

# Each entry pairs a tab with the header to show and the table to render,
# so all four tabs are filled by the same code path.
_TAB_CONTENTS = (
    (tab1, "Task: Dialogue Generation; Dataset: DailyDialog", df1),
    (tab2, "Task: Text Expansion; Dataset: WritingPrompts", df2),
    (tab3, "Task: Summary Generation; Dataset: Xsum", df3),
    (tab4, "Task: Non-Factoid QA; Dataset: NF_CATS", df4),
)

for _tab, _header, _frame in _TAB_CONTENTS:
    with _tab:
        st.header(_header)
        # Stretch the table to the full width of the tab container.
        st.dataframe(_frame, use_container_width=True)