File size: 5,298 Bytes
670a324
1b6f3b3
670a324
 
1b6f3b3
 
670a324
 
46b2948
 
670a324
 
 
0098fa3
670a324
 
 
46b2948
 
670a324
 
 
 
 
0098fa3
670a324
 
 
46b2948
 
670a324
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc4119e
670a324
e000c61
670a324
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f1ff79
 
 
 
670a324
c878c43
 
ada4b04
 
 
c878c43
 
ada4b04
 
 
c878c43
 
ada4b04
 
 
c878c43
670a324
 
 
 
 
2f1ff79
670a324
 
 
 
 
 
1b6f3b3
670a324
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1b6f3b3
 
670a324
1b6f3b3
670a324
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
# matplotlib.use('macosx')
import gradio as gr
import matplotlib
import numpy as np
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler

# Every task table starts with the same two free-text identity columns.
_IDENTITY_COLS = [("Team", "str"), ("Email", "str")]


def _numeric_cols(*names):
    """Pair each given metric name with the "number" display type."""
    return [(name, "number") for name in names]


# (column name, display type) pairs for each task's results table, in the
# order the columns are assigned when the CSV files are loaded.
TASK1_COLS = _IDENTITY_COLS + _numeric_cols("Acc", "F1", "MCC", "DLT")

TASK2_COLS = _IDENTITY_COLS + _numeric_cols(
    "Rouge-1",
    "Rouge-2",
    "Rouge-L",
    "BertScore",
    "BartScore",
    "DLT",
)

TASK3_COLS = _IDENTITY_COLS + _numeric_cols(
    "Sharpe Ratio",
    "Sharpe Ratio - DRIV",
    "Sharpe Ratio - FORM",
    "Sharpe Ratio - JNJ",
    "Sharpe Ratio - MSFT",
)


# Plain column-name lists (display types dropped).
task1_cols = [name for name, _kind in TASK1_COLS]
task2_cols = [name for name, _kind in TASK2_COLS]
task3_cols = [name for name, _kind in TASK3_COLS]


def create_df_dict(lang, lang_cols):
    """Load one results CSV and wrap it in a tab-name -> DataFrame mapping.

    Args:
        lang: File-name prefix; data is read from ``f"{lang}_result.csv"``,
            resolved against the current working directory.
        lang_cols: Column names assigned, in order, to the CSV columns.
            They are passed as ``names=``, so the first line of the file is
            read as data, not as a header row. Must include "Team".

    Returns:
        A dict with the single key "overall" whose value is the loaded
        DataFrame with the "Team" column moved to the front and all other
        columns kept in their original order.

    Raises:
        KeyError: If "Team" is not among the loaded columns.
    """
    leaderboard_df = pd.read_csv(f"{lang}_result.csv", names=lang_cols)

    # Show the team name first; the remaining columns keep their file order.
    other_cols = [col for col in leaderboard_df.columns if col != "Team"]
    leaderboard_df = leaderboard_df[["Team"] + other_cols]

    return {"overall": leaderboard_df}


# Top-level tab label -> {sub-tab label: DataFrame}; built once, at import
# time, by reading task1_result.csv, task2_result.csv and task3_result.csv.
df_lang = {
    f"Task {number}": create_df_dict(f"task{number}", column_names)
    for number, column_names in enumerate((task1_cols, task2_cols, task3_cols), start=1)
}


# Constants
# Page heading, rendered as raw HTML.
TITLE = '<h1 align="center" id="space-title">🐲 IJCAI 2024 FinLLM Challenge Leaderboard</h1>'
# Markdown body shown under the heading. The emoji below were previously
# stored as mojibake (UTF-8 bytes decoded with a legacy code page) and have
# been restored to the intended characters.
INTRODUCTION_TEXT = """📊 Introduction

The FinLLM Challenge rigorously evaluates state-of-the-art models in financial text analysis, generation, and decision-making tasks. These tasks include financial classification, financial text summarization, and single stock trading.

📈 Unique Evaluation Metrics

Our leaderboard incorporates a comprehensive evaluation using diverse metrics like Accuracy, F1 Score, ROUGE, BERTScore, and Sharpe Ratio to assess the models' capabilities in real-world financial applications.

📚 Task Details

**Task 1: Financial Classification**

- **Objective:** Classify sentences as claims or premises.
- **Dataset:** 7.75k training data, 969 test data.
- **Evaluation Metrics:** F1 Score (final ranking metric) and Accuracy.

**Task 2: Financial Text Summarization**

- **Objective:** Summarize financial news articles into concise texts.
- **Dataset:** 8k training data, 2k test data.
- **Evaluation Metrics:** ROUGE (1, 2, L) and BERTScore (ROUGE-1 as the final ranking metric).

**Task 3: Single Stock Trading**

- **Objective:** Make stock trading decisions (buy, sell, hold) with reasonings.
- **Dataset:** 291 data points.
- **Evaluation Metrics:** Sharpe Ratio (final ranking metric), Cumulative Return, Daily and Annualized Volatility, Maximum Drawdown.

**Model Cheating Detection: Data Leakage Test (DLT)**

To measure the risk of data leakage from the test set used in training, we introduce the Data Leakage Test (DLT). The DLT calculates the difference in perplexity between the training set and the test set. A larger difference indicates a lower likelihood of model cheating, while a smaller difference suggests a higher likelihood.

For more details, refer to our [Challenge page](https://sites.google.com/nlg.csie.ntu.edu.tw/finnlp-agentscen/shared-task-finllm?authuser=0).

**Task 1: Top 3**
🥇 Team Barclays
🥈 Albatross
🥉 L3iTC

**Task 2: Top 3**
🥇 LBZ
🥈 NP
🥉 Finance Wizard

**Task 3: Top 3**
🥇 Wealth Guide
🥈 NP
🥉 Albatross

"""


def create_data_interface(df):
    """Render a DataFrame as a Gradio ``Dataframe`` component.

    Args:
        df: pandas DataFrame to display. Its column labels become the table
            headers and its rows become the table values.

    Returns:
        The ``gr.components.Dataframe`` created from ``df``.
    """
    headers = list(df.columns)

    # The first column is always declared as text, as before. Every other
    # column is typed from its actual dtype, so free-text columns such as
    # "Email" are no longer declared as "number".
    types = ["str"] + [
        "number" if pd.api.types.is_numeric_dtype(dtype) else "str"
        for dtype in df.dtypes.iloc[1:]
    ]

    return gr.components.Dataframe(
        value=df.values.tolist(),
        headers=headers,
        datatype=types,
    )


def plot_radar_chart(df, attributes, category_name):
    """Build a radar (polar) chart with one filled trace per row of ``df``.

    Args:
        df: DataFrame with a "Model" column (used as the trace name) plus
            the columns listed in ``attributes``.
        attributes: Column labels plotted around the polar axis; each row's
            values for these columns form that row's trace.
        category_name: Accepted but not used by this function.

    Returns:
        The figure, titled "FLARE", with the radial axis fixed to [0, 0.9].

    NOTE(review): ``go`` is not imported anywhere in the visible file
    (presumably ``plotly.graph_objects``), so calling this would raise
    NameError until that import is added.
    NOTE(review): the task tables defined in this file are keyed by "Team",
    not "Model" - confirm which frames this is meant to receive.
    """
    figure = go.Figure()

    for _, record in df.iterrows():
        trace_name = record["Model"]
        radii = record[attributes].tolist()
        trace = go.Scatterpolar(r=radii, theta=attributes, fill="toself", name=trace_name)
        figure.add_trace(trace)

    radial_axis = dict(visible=True, range=[0, 0.9])
    figure.update_layout(title="FLARE", polar=dict(radialaxis=radial_axis), showlegend=True)

    return figure


def create_data_interface_for_aggregated(df, category_name):
    """Return a radar chart over every column of ``df`` except the first.

    Args:
        df: DataFrame whose first column is skipped and whose remaining
            columns are plotted.
        category_name: Forwarded unchanged to ``plot_radar_chart``.

    Returns:
        Whatever ``plot_radar_chart`` returns for these columns.
    """
    metric_columns = df.columns[1:]
    print(metric_columns)
    return plot_radar_chart(df, metric_columns, category_name)


def create_lang_leaderboard(df_dict):
    """Create one Gradio tab per entry of ``df_dict`` and fill it with a table.

    Args:
        df_dict: Mapping of tab label -> DataFrame; each DataFrame is
            rendered inside its own tab via ``create_data_interface``.
    """
    for tab_label, table in df_dict.items():
        with gr.Tab(tab_label):
            create_data_interface(table)


def launch_gradio():
    """Assemble the leaderboard page and launch it.

    The page consists of the HTML title, the Markdown introduction and one
    tab per entry of ``df_lang``, each filled by ``create_lang_leaderboard``.
    """
    with gr.Blocks() as demo:
        gr.HTML(TITLE)
        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
        for task_label, task_tables in df_lang.items():
            with gr.Tab(task_label):
                create_lang_leaderboard(task_tables)

    demo.launch()


# Register launch_gradio as an interval job that fires every 3600 seconds on
# a background scheduler, then start the scheduler.
# NOTE(review): each scheduled run builds and launches a brand-new Blocks app,
# while df_lang is loaded only once at import time, so a re-launch would not
# pick up changed CSV contents. Confirm whether the intent was to refresh the
# data or restart the app rather than call launch_gradio again.
scheduler = BackgroundScheduler()
scheduler.add_job(launch_gradio, "interval", seconds=3600)
scheduler.start()

# Launch immediately instead of waiting for the first scheduled run.
launch_gradio()