Spaces:

THUIR
/

AEOLLM

Running

File size: 2,747 Bytes

7f6ca6e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eaa1d85
 
7f6ca6e
 
 
4d2570c
7f6ca6e
 
eaa1d85
 
 
 
 
 
 
7f6ca6e
eaa1d85
7f6ca6e
eaa1d85
 
 
 
 
 
7f6ca6e
eaa1d85
7f6ca6e
eaa1d85
 
 
 
 
 
7f6ca6e
eaa1d85
7f6ca6e
eaa1d85
 
 
 
 
 
7f6ca6e
eaa1d85
7f6ca6e
 
eaa1d85
7f6ca6e
eaa1d85
7f6ca6e
eaa1d85
7f6ca6e
 
eaa1d85
7f6ca6e
eaa1d85
7f6ca6e
 
 
eaa1d85
7f6ca6e
 
eaa1d85
7f6ca6e
eaa1d85

import streamlit as st
import pandas as pd

# Custom page CSS: title size, dataframe font, and table cell widths.
_PAGE_CSS = """
<style>
h1 {
    font-size: 2.5em;  /* 标题字体大小 */
}
.stDataFrame {
    font-family: Helvetica;
}
.dataframe th, .dataframe td {
    width: auto;
    min-width: 500px; 
}
</style>
"""

# The stylesheet is raw HTML, so it is passed with unsafe_allow_html=True.
st.markdown(_PAGE_CSS, unsafe_allow_html=True)

# Page title.
st.title('🏆AEOLLM Leaderboard')

# Introductory description: what the leaderboard shows and the four tasks.
# The body is plain Markdown (no HTML), so unsafe_allow_html is not passed.
st.markdown("""
This leaderboard is used to show the performance of the **automatic evaluation methods of LLMs** submitted by the **AEOLLM team** on four tasks:
- Dialogue Generation (DG)
- Text Expansion (TE)
- Summary Generation (SG)
- Non-Factoid QA (NFQA)
            
Details of AEOLLM can be found at the link: [https://aeollm.github.io/](https://aeollm.github.io/)
""")
# Leaderboard data: one column-oriented dict and one DataFrame per task.
# Every task lists the same four methods, all attributed to team "baseline".


def _baseline_scores(accuracy, kendall_tau, spearman):
    """Build one task's column dict from its three per-method score lists.

    Each list holds one value per method, in the order of ``methods`` below.
    Fresh ``methods``/``team`` lists are created on every call so the task
    dicts never share list objects.
    """
    methods = ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"]
    return {
        "methods": methods,
        "team": ["baseline"] * len(methods),
        "accuracy": accuracy,
        "kendall's tau": kendall_tau,
        "spearman": spearman,
    }


# Dialogue Generation
DG = _baseline_scores(
    accuracy=[0.5806, 0.5483, 0.6001, 0.6472],
    kendall_tau=[0.3243, 0.1739, 0.3042, 0.4167],
    spearman=[0.3505, 0.1857, 0.3264, 0.4512],
)
df1 = pd.DataFrame(DG)

# Text Expansion
TE = _baseline_scores(
    accuracy=[0.5107, 0.5050, 0.5461, 0.5581],
    kendall_tau=[0.1281, 0.0635, 0.2716, 0.3864],
    spearman=[0.1352, 0.0667, 0.2867, 0.4157],
)
df2 = pd.DataFrame(TE)

# Summary Generation
SG = _baseline_scores(
    accuracy=[0.6504, 0.6014, 0.7162, 0.7441],
    kendall_tau=[0.3957, 0.2688, 0.5092, 0.5001],
    spearman=[0.4188, 0.2817, 0.5403, 0.5405],
)
df3 = pd.DataFrame(SG)

# Non-Factoid QA
NFQA = _baseline_scores(
    accuracy=[0.5935, 0.5817, 0.7000, 0.7203],
    kendall_tau=[0.2332, 0.2389, 0.4440, 0.4235],
    spearman=[0.2443, 0.2492, 0.4630, 0.4511],
)
df4 = pd.DataFrame(NFQA)

# Create one tab per task.
tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])

# Fill each tab with its task/dataset header and the matching results table.
# Tabs are populated in display order: DG, TE, SG, NFQA.
for _tab, _heading, _table in (
    (tab1, "Task: Dialogue Generation; Dataset: DailyDialog", df1),
    (tab2, "Task: Text Expansion; Dataset: WritingPrompts", df2),
    (tab3, "Task: Summary Generation; Dataset: Xsum", df3),
    (tab4, "Task: Non-Factoid QA; Dataset: NF_CATS", df4),
):
    with _tab:
        st.header(_heading)
        st.dataframe(_table, use_container_width=True)