Spaces:

THUIR
/

AEOLLM

Running

App Files Files Community

陈俊杰 committed on Aug 29, 2024

Commit

eaa1d85

•

1 Parent(s): 4d2570c

cjj: updateRank

Browse files

Files changed (1) hide show

app.py +40 -40

app.py CHANGED Viewed

@@ -23,69 +23,69 @@ st.title('🏆AEOLLM Leaderboard')
 # 描述
 st.markdown("""
 This leaderboard is used to show the performance of the **automatic evaluation methods of LLMs** submitted by the **AEOLLM team** on four tasks:
 - Summary Generation (SG)
 - Non-Factoid QA (NFQA)
-- Dialogue Generation (DG)
-- Text Expansion (TE).
 Details of AEOLLLM can be found at the link: [https://aeollm.github.io/](https://aeollm.github.io/)
 """, unsafe_allow_html=True)
 # 创建示例数据
-SG = {
-    "methods": ["Model A", "Model B", "Model C"],
-    "team": ["U1", "U2", "U3"],
-    "acc": [0.75, 0.64, 0.83],
-    "tau": [0.05, 0.28, 0.16],
-    "s": [0.12, 0.27, 0.18],
 }
-df1 = pd.DataFrame(SG)
-NFQA = {
-    "methods": ["Model A", "Model B", "Model C"],
-    "team": ["U1", "U2", "U3"],
-    "acc": [0.75, 0.64, 0.83],
-    "tau": [0.05, 0.28, 0.16],
-    "s": [0.12, 0.27, 0.18]
 }
-df2 = pd.DataFrame(NFQA)
-DG = {
-    "methods": ["Model A", "Model B", "Model C"],
-    "team": ["U1", "U2", "U3"],
-    "acc": [0.75, 0.64, 0.83],
-    "tau": [0.05, 0.28, 0.16],
-    "s": [0.12, 0.27, 0.18]
 }
-df3 = pd.DataFrame(DG)
-TE = {
-    "methods": ["Model A", "Model B", "Model C"],
-    "team": ["U1", "U2", "U3"],
-    "acc": [0.75, 0.64, 0.83],
-    "tau": [0.05, 0.28, 0.16],
-    "s": [0.12, 0.27, 0.18]
 }
-df4 = pd.DataFrame(TE)
 # 创建标签页
-tab1, tab2, tab3, tab4 = st.tabs(["SG", "NFQA", "DG", "TE"])
-# 在标签页 1 中添加内容
 with tab1:
-    st.header("Summary Generation")
     st.dataframe(df1, use_container_width=True)
-# 在标签页 2 中添加内容
 with tab2:
-    st.header("Non-Factoid QA")
     st.dataframe(df2, use_container_width=True)
-# 在标签页 3 中添加内容
 with tab3:
-    st.header("Dialogue Generation")
     st.dataframe(df3, use_container_width=True)
-# 在标签页 4 中添加内容
 with tab4:
-    st.header("Text Expansion")
-    st.dataframe(df4, use_container_width=True, )

 # 描述
 st.markdown("""
 This leaderboard is used to show the performance of the **automatic evaluation methods of LLMs** submitted by the **AEOLLM team** on four tasks:
+- Dialogue Generation (DG)
+- Text Expansion (TE)
 - Summary Generation (SG)
 - Non-Factoid QA (NFQA)
 Details of AEOLLLM can be found at the link: [https://aeollm.github.io/](https://aeollm.github.io/)
 """, unsafe_allow_html=True)
 # 创建示例数据
+DG = {
+    "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
+    "team": ["baseline", "baseline", "baseline", "baseline"],
+    "accuracy": [0.5806, 0.5483, 0.6001, 0.6472],
+    "kendall's tau": [0.3243, 0.1739, 0.3042, 0.4167],
+    "spearman": [0.3505, 0.1857, 0.3264, 0.4512]
 }
+df1 = pd.DataFrame(DG)
+TE = {
+    "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
+    "team": ["baseline", "baseline", "baseline", "baseline"],
+    "accuracy": [0.5107, 0.5050, 0.5461, 0.5581],
+    "kendall's tau": [0.1281, 0.0635, 0.2716, 0.3864],
+    "spearman": [0.1352, 0.0667, 0.2867, 0.4157]
 }
+df2 = pd.DataFrame(TE)
+SG = {
+    "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
+    "team": ["baseline", "baseline", "baseline", "baseline"],
+    "accuracy": [0.6504, 0.6014, 0.7162, 0.7441],
+    "kendall's tau": [0.3957, 0.2688, 0.5092, 0.5001],
+    "spearman": [0.4188, 0.2817, 0.5403, 0.5405],
 }
+df3 = pd.DataFrame(SG)
+NFQA = {
+    "methods": ["chatglm3-6b", "baichuan2-13b", "chatglm-pro", "gpt-4o-mini"],
+    "team": ["baseline", "baseline", "baseline", "baseline"],
+    "accuracy": [0.5935, 0.5817, 0.7000, 0.7203],
+    "kendall's tau": [0.2332, 0.2389, 0.4440, 0.4235],
+    "spearman": [0.2443, 0.2492, 0.4630, 0.4511]
 }
+df4 = pd.DataFrame(NFQA)
 # 创建标签页
+tab1, tab2, tab3, tab4 = st.tabs(["DG", "TE", "SG", "NFQA"])
+# Add content to tab 1 (Dialogue Generation)
 with tab1:
+    st.header("Task: Dialogue Generation; Dataset: DialyDialog")
     st.dataframe(df1, use_container_width=True)
+# Add content to tab 2 (Text Expansion)
 with tab2:
+    st.header("Task: Text Expansion; Dataset: WritingPrompts")
     st.dataframe(df2, use_container_width=True)
 with tab3:
+    st.header("Task: Summary Generation; Dataset: Xsum")
     st.dataframe(df3, use_container_width=True)
+# Add content to tab 4 (Non-Factoid QA)
 with tab4:
+    st.header("Task: Non-Factoid QA; Dataset: NF_CATS")
+    st.dataframe(df4, use_container_width=True)