File size: 2,199 Bytes
73a9525
 
 
1edab07
 
 
1aaa274
 
ca872a1
1edab07
4c24185
73a9525
1aaa274
c58f8c1
 
ed56ab7
1edab07
 
ed56ab7
4c24185
1aaa274
ed56ab7
4c24185
 
d6e3121
4c24185
 
 
 
c58f8c1
 
4c24185
1edab07
4c24185
 
 
 
 
c58f8c1
4c24185
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c58f8c1
4c24185
 
 
 
c58f8c1
4c24185
 
 
 
c58f8c1
4c24185
 
 
 
c58f8c1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import streamlit as st
import pandas as pd

# Custom CSS styles: kept in a named constant so the stylesheet can be read
# separately from the call that injects it into the page.
_PAGE_STYLE = """
<style>
h1 {
    font-size: 2.5em;  /* 标题字体大小 */
}
.stDataFrame {
    font-family: Helvetica;
}
.dataframe th, .dataframe td {
    width: auto;
    min-width: 500px; 
}
</style>
"""

# A raw <style> element is only emitted when HTML rendering is enabled.
st.markdown(_PAGE_STYLE, unsafe_allow_html=True)

# Page title.
st.title('🏆AEOLLM Leaderboard')

# Introductory description: what the leaderboard shows, the four tasks it
# covers, and a link to the project page.
# The text is pure Markdown (bold, bullet list, link) with no raw HTML, so it
# is rendered without unsafe_allow_html.
st.markdown("""
This leaderboard is used to show the performance of the **automatic evaluation methods of LLMs** submitted by the **AEOLLM team** on four tasks:
- Summary Generation (SG)
- Non-Factoid QA (NFQA)
- Dialogue Generation (DG)
- Text Expansion (TE).
            
Details of AEOLLM can be found at the link: [https://cjj826.github.io/AEOLLM/](https://cjj826.github.io/AEOLLM/)
""")
# Create sample data: every task currently shows the same placeholder rows.
def _sample_scores():
    """Return a fresh dict of placeholder leaderboard columns.

    A new dict (with new lists) is built on every call so that the four
    task tables never share mutable state with one another.
    """
    return {
        "methods": ["Model A", "Model B", "Model C"],
        "team": ["U1", "U2", "U3"],
        "acc": [0.75, 0.64, 0.83],
        "tau": [0.05, 0.28, 0.16],
        "s": [0.12, 0.27, 0.18],
    }


# Summary Generation
SG = _sample_scores()
df1 = pd.DataFrame(SG)

# Non-Factoid QA
NFQA = _sample_scores()
df2 = pd.DataFrame(NFQA)

# Dialogue Generation
DG = _sample_scores()
df3 = pd.DataFrame(DG)

# Text Expansion
TE = _sample_scores()
df4 = pd.DataFrame(TE)

# Create the tabs, one per task, labelled with the task abbreviation.
tab1, tab2, tab3, tab4 = st.tabs(["SG", "NFQA", "DG", "TE"])

# Each entry pairs a tab with the heading and the table rendered inside it.
_TAB_LAYOUT = (
    (tab1, "Summary Generation", df1),
    (tab2, "Non-Factoid QA", df2),
    (tab3, "Dialogue Generation", df3),
    (tab4, "Text Expansion", df4),
)

# Fill the tabs in order: a heading followed by the full-width score table.
for _tab, _heading, _scores in _TAB_LAYOUT:
    with _tab:
        st.header(_heading)
        st.dataframe(_scores, use_container_width=True)