File size: 2,747 Bytes
7f6ca6e eaa1d85 7f6ca6e 4d2570c 7f6ca6e eaa1d85 7f6ca6e eaa1d85 7f6ca6e eaa1d85 7f6ca6e eaa1d85 7f6ca6e eaa1d85 7f6ca6e eaa1d85 7f6ca6e eaa1d85 7f6ca6e eaa1d85 7f6ca6e eaa1d85 7f6ca6e eaa1d85 7f6ca6e eaa1d85 7f6ca6e eaa1d85 7f6ca6e eaa1d85 7f6ca6e eaa1d85 7f6ca6e eaa1d85 7f6ca6e eaa1d85 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
import streamlit as st
import pandas as pd
# Custom page CSS, injected as raw HTML: sets the h1 font size, the font
# family of the .stDataFrame container, and a minimum width for
# .dataframe header/body cells.
_PAGE_STYLE = """
<style>
h1 {
font-size: 2.5em; /* 标题字体大小 */
}
.stDataFrame {
font-family: Helvetica;
}
.dataframe th, .dataframe td {
width: auto;
min-width: 500px;
}
</style>
"""
# unsafe_allow_html is required so the <style> tag is not escaped.
st.markdown(_PAGE_STYLE, unsafe_allow_html=True)
# Page title.
st.title('🏆AEOLLM Leaderboard')
# Introductory description: what the leaderboard reports and where to find
# more information. The blank line after the task list keeps Markdown from
# folding the closing sentence into the last bullet item.
st.markdown("""
This leaderboard is used to show the performance of the **automatic evaluation methods of LLMs** submitted by the **AEOLLM team** on four tasks:
- Dialogue Generation (DG)
- Text Expansion (TE)
- Summary Generation (SG)
- Non-Factoid QA (NFQA)

Details of AEOLLM can be found at the link: [https://aeollm.github.io/](https://aeollm.github.io/)
""", unsafe_allow_html=True)
# Baseline result tables, one per task. Every table lists the same four
# methods, all attributed to the "baseline" team; only the metric columns
# differ between tasks.
_BASELINE_METHODS = ("chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini")


def _baseline_scores(accuracy, kendall_tau, spearman):
    """Return one task's column-name -> values mapping for the baseline methods."""
    return {
        "methods": list(_BASELINE_METHODS),
        "team": ["baseline"] * len(_BASELINE_METHODS),
        "accuracy": accuracy,
        "kendall's tau": kendall_tau,
        "spearman": spearman,
    }


# Dialogue Generation.
DG = _baseline_scores(
    accuracy=[0.5806, 0.5483, 0.6001, 0.6472],
    kendall_tau=[0.3243, 0.1739, 0.3042, 0.4167],
    spearman=[0.3505, 0.1857, 0.3264, 0.4512],
)
df1 = pd.DataFrame(data=DG)

# Text Expansion.
TE = _baseline_scores(
    accuracy=[0.5107, 0.5050, 0.5461, 0.5581],
    kendall_tau=[0.1281, 0.0635, 0.2716, 0.3864],
    spearman=[0.1352, 0.0667, 0.2867, 0.4157],
)
df2 = pd.DataFrame(data=TE)

# Summary Generation.
SG = _baseline_scores(
    accuracy=[0.6504, 0.6014, 0.7162, 0.7441],
    kendall_tau=[0.3957, 0.2688, 0.5092, 0.5001],
    spearman=[0.4188, 0.2817, 0.5403, 0.5405],
)
df3 = pd.DataFrame(data=SG)

# Non-Factoid QA.
NFQA = _baseline_scores(
    accuracy=[0.5935, 0.5817, 0.7000, 0.7203],
    kendall_tau=[0.2332, 0.2389, 0.4440, 0.4235],
    spearman=[0.2443, 0.2492, 0.4630, 0.4511],
)
df4 = pd.DataFrame(data=NFQA)
# Create one tab per task, in the order DG, TE, SG, NFQA.
tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])
# Tab 1: Dialogue Generation results.
with tab1:
    st.header("Task: Dialogue Generation; Dataset: DailyDialog")
    st.dataframe(df1, use_container_width=True)
# Tab 2: Text Expansion results.
with tab2:
    st.header("Task: Text Expansion; Dataset: WritingPrompts")
    st.dataframe(df2, use_container_width=True)
# Tab 3: Summary Generation results.
with tab3:
    st.header("Task: Summary Generation; Dataset: Xsum")
    st.dataframe(df3, use_container_width=True)
# Tab 4: Non-Factoid QA results.
with tab4:
    st.header("Task: Non-Factoid QA; Dataset: NF_CATS")
    st.dataframe(df4, use_container_width=True)
|