import streamlit as st
import pandas as pd
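
# AEOLLM leaderboard app: renders baseline automatic-evaluation results for
# four tasks as tabbed tables. To launch it locally (assuming this file is
# saved as app.py; the filename is illustrative): streamlit run app.py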

# Inject CSS: enlarge the page title and widen the leaderboard tables.
st.markdown("""
<style>
h1 {
    font-size: 2.5em; /* title font size */
}
.stDataFrame {
    font-family: Helvetica;
}
.dataframe th, .dataframe td {
    width: auto;
    min-width: 500px;
}
</style>
""", unsafe_allow_html=True)

st.title('🏆AEOLLM Leaderboard')

st.markdown("""
This leaderboard shows the performance of the **automatic evaluation methods of LLMs** submitted by the **AEOLLM team** on four tasks:
- Dialogue Generation (DG)
- Text Expansion (TE)
- Summary Generation (SG)
- Non-Factoid QA (NFQA)

Details of AEOLLM can be found at the link: [https://aeollm.github.io/](https://aeollm.github.io/)
""", unsafe_allow_html=True)

# Per-task baseline results: accuracy plus two rank-correlation metrics
# (Kendall's tau and Spearman) for each evaluation method.
DG = {
    "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
    "team": ["baseline", "baseline", "baseline", "baseline"],
    "accuracy": [0.5806, 0.5483, 0.6001, 0.6472],
    "kendall's tau": [0.3243, 0.1739, 0.3042, 0.4167],
    "spearman": [0.3505, 0.1857, 0.3264, 0.4512],
}
df1 = pd.DataFrame(DG)

TE = {
    "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
    "team": ["baseline", "baseline", "baseline", "baseline"],
    "accuracy": [0.5107, 0.5050, 0.5461, 0.5581],
    "kendall's tau": [0.1281, 0.0635, 0.2716, 0.3864],
    "spearman": [0.1352, 0.0667, 0.2867, 0.4157],
}
df2 = pd.DataFrame(TE)

SG = {
    "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
    "team": ["baseline", "baseline", "baseline", "baseline"],
    "accuracy": [0.6504, 0.6014, 0.7162, 0.7441],
    "kendall's tau": [0.3957, 0.2688, 0.5092, 0.5001],
    "spearman": [0.4188, 0.2817, 0.5403, 0.5405],
}
df3 = pd.DataFrame(SG)

NFQA = {
    "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
    "team": ["baseline", "baseline", "baseline", "baseline"],
    "accuracy": [0.5935, 0.5817, 0.7000, 0.7203],
    "kendall's tau": [0.2332, 0.2389, 0.4440, 0.4235],
    "spearman": [0.2443, 0.2492, 0.4630, 0.4511],
}
df4 = pd.DataFrame(NFQA)
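
# The four tables share the same "methods" and "team" columns; a minimal
# sketch of a builder that would remove the repetition (an optional refactor,
# not used below; make_df, METHODS, and TEAM are names invented here):
#
# METHODS = ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"]
# TEAM = ["baseline"] * len(METHODS)
#
# def make_df(accuracy, tau, spearman):
#     return pd.DataFrame({
#         "methods": METHODS,
#         "team": TEAM,
#         "accuracy": accuracy,
#         "kendall's tau": tau,
#         "spearman": spearman,
#     })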

# One tab per task.
tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])

with tab1:
    st.header("Task: Dialogue Generation; Dataset: DailyDialog")
    st.dataframe(df1, use_container_width=True)

with tab2:
    st.header("Task: Text Expansion; Dataset: WritingPrompts")
    st.dataframe(df2, use_container_width=True)

with tab3:
    st.header("Task: Summary Generation; Dataset: XSum")
    st.dataframe(df3, use_container_width=True)

with tab4:
    st.header("Task: Non-Factoid QA; Dataset: NF_CATS")
    st.dataframe(df4, use_container_width=True)
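
# An equivalent loop-based rendering (a sketch only, not used above; the
# `pages` name is invented here):
#
# pages = {
#     "DG": ("Task: Dialogue Generation; Dataset: DailyDialog", df1),
#     "TE": ("Task: Text Expansion; Dataset: WritingPrompts", df2),
#     "SG": ("Task: Summary Generation; Dataset: XSum", df3),
#     "NFQA": ("Task: Non-Factoid QA; Dataset: NF_CATS", df4),
# }
# for tab, (header, df) in zip(st.tabs(list(pages)), pages.values()):
#     with tab:
#         st.header(header)
#         st.dataframe(df, use_container_width=True)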