import gradio as gr
import matplotlib
# matplotlib.use('macosx')
import numpy as np
import pandas as pd
import plotly.graph_objects as go  # required by plot_radar_chart
from apscheduler.schedulers.background import BackgroundScheduler

TASK1_COLS = [
    ("Team", "str"),
    ("Email", "str"),
    ("Acc", "number"),
    ("F1", "number"),
    ("MCC", "number"),
    ("DLT", "number"),
]

TASK2_COLS = [
    ("Team", "str"),
    ("Email", "str"),
    ("Rouge-1", "number"),
    ("Rouge-2", "number"),
    ("Rouge-L", "number"),
    ("BertScore", "number"),
    ("BartScore", "number"),
    ("DLT", "number"),
]

TASK3_COLS = [
    ("Team", "str"),
    ("Email", "str"),
    ("Sharpe Ratio", "number"),
    ("Sharpe Ratio - DRIV", "number"),
    ("Sharpe Ratio - FORM", "number"),
    ("Sharpe Ratio - JNJ", "number"),
    ("Sharpe Ratio - MSFT", "number"),
]

# Extract column names
task1_cols = [col_name for col_name, _ in TASK1_COLS]
task2_cols = [col_name for col_name, _ in TASK2_COLS]
task3_cols = [col_name for col_name, _ in TASK3_COLS]

def create_df_dict(task, task_cols):
    # Load leaderboard data with column names
    leaderboard_df = pd.read_csv(f"{task}_result.csv", names=task_cols)
    # Move the "Team" column to the front
    leaderboard_df = leaderboard_df[["Team"] + [col for col in leaderboard_df.columns if col != "Team"]]
    # A single "overall" view per task; per-subtask dataframes would also live here
    df_dict = {"overall": leaderboard_df}
    return df_dict

df_lang = {
    "Task 1": create_df_dict("task1", task1_cols),
    "Task 2": create_df_dict("task2", task2_cols),
    "Task 3": create_df_dict("task3", task3_cols),
}

# Constants
TITLE = '<h1 align="center" id="space-title">IJCAI 2024 FinLLM Challenge Leaderboard</h1>'
INTRODUCTION_TEXT = """**Introduction**

The FinLLM Challenge rigorously evaluates state-of-the-art models on financial text analysis, generation, and decision-making tasks: financial classification, financial text summarization, and single stock trading.

**Unique Evaluation Metrics**

Our leaderboard incorporates a comprehensive evaluation built on diverse metrics such as Accuracy, F1 Score, ROUGE, BERTScore, and Sharpe Ratio to assess the models' capabilities in real-world financial applications.

**Task Details**

**Task 1: Financial Classification**

- **Objective:** Classify sentences as claims or premises.
- **Dataset:** 7.75k training examples and 969 test examples.
- **Evaluation Metrics:** F1 Score (final ranking metric) and Accuracy.

**Task 2: Financial Text Summarization**

- **Objective:** Summarize financial news articles into concise texts.
- **Dataset:** 8k training examples and 2k test examples.
- **Evaluation Metrics:** ROUGE (1, 2, L) and BERTScore (ROUGE-1 as the final ranking metric).

**Task 3: Single Stock Trading**

- **Objective:** Make stock trading decisions (buy, sell, hold) with supporting reasoning.
- **Dataset:** 291 data points.
- **Evaluation Metrics:** Sharpe Ratio (final ranking metric), Cumulative Return, Daily and Annualized Volatility, and Maximum Drawdown; a sketch of the Sharpe Ratio computation follows below.
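
As a rough illustration (a minimal sketch assuming daily returns, a zero risk-free rate, and 252 trading days per year; not the official evaluation code), the annualized Sharpe Ratio can be computed as:

```python
import numpy as np

def sharpe_ratio(daily_returns, periods_per_year=252):
    # Annualized Sharpe Ratio: mean daily return divided by its volatility,
    # scaled by the square root of the number of trading periods per year
    returns = np.asarray(daily_returns, dtype=float)
    return np.sqrt(periods_per_year) * returns.mean() / returns.std()
```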

**Model Cheating Detection: Data Leakage Test (DLT)**

To measure the risk that the test set leaked into a model's training data, we introduce the Data Leakage Test (DLT). The DLT computes the difference in perplexity between the training set and the test set: a larger difference indicates a lower likelihood of model cheating, while a smaller difference suggests a higher one.
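
As an illustrative sketch of the idea (assuming a causal Hugging Face language model; one plausible reading of the metric, not the organizers' implementation):

```python
import numpy as np
import torch

def perplexity(model, tokenizer, texts):
    # Mean perplexity of a causal language model over a list of texts
    losses = []
    for text in texts:
        ids = tokenizer(text, return_tensors="pt", truncation=True).input_ids
        with torch.no_grad():
            losses.append(model(ids, labels=ids).loss.item())
    return float(np.exp(np.mean(losses)))

def data_leakage_test(model, tokenizer, train_texts, test_texts):
    # Perplexity gap between the test and training sets: a larger gap suggests
    # the test set is less likely to have been seen during training
    return perplexity(model, tokenizer, test_texts) - perplexity(model, tokenizer, train_texts)
```
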
For more details, refer to our [Challenge page](https://sites.google.com/nlg.csie.ntu.edu.tw/finnlp-agentscen/shared-task-finllm?authuser=0).

**Task 1: Top 3**

- 🥇 Team Barclays
- 🥈 Albatross
- 🥉 L3iTC

**Task 2: Top 3**

- 🥇 LBZ
- 🥈 NP
- 🥉 Finance Wizard

**Task 3: Top 3**

- 🥇 Wealth Guide
- 🥈 NP
- 🥉 Albatross
"""

def create_data_interface(df):
    # Render one leaderboard dataframe as a Gradio table
    headers = df.columns
    types = ["str"] + ["number"] * (len(headers) - 1)
    return gr.components.Dataframe(
        value=df.values.tolist(),
        headers=list(headers),
        datatype=types,
    )

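
# NOTE: the two radar-chart helpers below are defined but not yet wired into the
# Gradio UI built in launch_gradio().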
def plot_radar_chart(df, attributes, category_name):
    # One radar trace per model over the given metric columns
    fig = go.Figure()
    for _, row in df.iterrows():
        model = row["Model"]
        values = row[attributes].tolist()
        fig.add_trace(go.Scatterpolar(r=values, theta=attributes, fill="toself", name=model))
    fig.update_layout(
        title=category_name,
        polar=dict(radialaxis=dict(visible=True, range=[0, 0.9])),
        showlegend=True,
    )
    return fig

def create_data_interface_for_aggregated(df, category_name):
    # Radar chart over every metric column (all columns after the model name)
    attributes = df.columns[1:]
    return plot_radar_chart(df, attributes, category_name)

def create_lang_leaderboard(df_dict):
    # One tab per subtask within a task
    for key, df in df_dict.items():
        with gr.Tab(key):
            create_data_interface(df)

def launch_gradio():
    # Build the leaderboard UI: title, introduction, then one tab per task
    demo = gr.Blocks()
    with demo:
        gr.HTML(TITLE)
        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
        for key, df_dict in df_lang.items():
            with gr.Tab(key):
                create_lang_leaderboard(df_dict)
    demo.launch()

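
# Re-run launch_gradio on an hourly interval (intended to refresh the
# leaderboard with newly updated result CSVs)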
scheduler = BackgroundScheduler()
scheduler.add_job(launch_gradio, "interval", seconds=3600)
scheduler.start()
# Launch immediately
launch_gradio()