mirageco committed on
Commit e9d718d · 1 Parent(s): 9d3d5c0

Add small descriptions of each of the datasets

Files changed (2)
  1. src/about.py +39 -39
  2. src/leaderboard/read_evals.py +6 -4
src/about.py CHANGED
@@ -100,45 +100,45 @@ If the icon is "?", it indicates that there is insufficient information about th
 
 Our evaluation metrics include, but are not limited to, Accuracy, F1 Score, ROUGE score, BERTScore, and Matthews correlation coefficient (MCC), providing a multidimensional assessment of model performance. Metrics for specific tasks are as follows:
 
- - **FPB**: F1
- - **FiQA-SA**: F1
- - **TSA**: RMSE
- - **Headlines**: AvgF1
- - **FOMC**: F1
- - **FinArg-ACC**: MicroF1
- - **FinArg-ARC**: MicroF1
- - **Multifin**: MicroF1
- - **MA**: MicroF1
- - **MLESG**: MicroF1
- - **NER**: EntityF1
- - **FINER-ORD**: EntityF1
- - **FinRED**: F1
- - **SC**: F1
- - **CD**: F1
- - **FinQA**: EmAcc
- - **TATQA**: EmAcc
- - **ConvFinQA**: EmAcc
- - **FNXL**: EntityF1
- - **FSRL**: EntityF1
- - **EDTSUM**: Rouge-1
- - **ECTSUM**: Rouge-1
- - **BigData22**: Acc
- - **ACL18**: Acc
- - **CIKM18**: Acc
- - **German**: MCC
- - **Australian**: MCC
- - **LendingClub**: MCC
- - **ccf**: MCC
- - **ccfraud**: MCC
- - **polish**: MCC
- - **taiwan**: MCC
- - **portoseguro**: MCC
- - **travelinsurance**: MCC
- - **MultiFin-ES**: F1
- - **EFP**: F1
- - **EFPA**: F1
- - **FinanceES**: F1
- - **TSA-Spanish**: F1
+ - **FPB**: F1, Accuracy. Financial PhraseBank classification task.
+ - **FiQA-SA**: F1. Sentiment analysis on FiQA financial domain.
+ - **TSA**: F1, Accuracy. Sentiment analysis.
+ - **Headlines**: AvgF1. News headline classification.
+ - **FOMC**: F1, Accuracy. Hawkish-dovish classification.
+ - **FinArg-ACC**: F1, Accuracy. Financial argument unit classification.
+ - **FinArg-ARC**: F1, Accuracy. Financial argument relation classification.
+ - **MultiFin**: F1, Accuracy. Multi-class financial sentiment analysis.
+ - **MA**: F1, Accuracy. Deal completeness classification.
+ - **MLESG**: F1, Accuracy. ESG issue identification.
+ - **NER**: EntityF1. Named entity recognition in financial texts.
+ - **FINER-ORD**: EntityF1. Ordinal classification in financial NER.
+ - **FinRED**: F1, EntityF1. Financial relation extraction from text.
+ - **SC**: F1, EntityF1. Causal classification task in the financial domain.
+ - **CD**: F1, EntityF1. Causal detection.
+ - **FinQA**: EmAcc. Numerical question answering in finance.
+ - **TATQA**: F1, EmAcc. Table-based question answering in financial documents.
+ - **ConvFinQA**: EmAcc. Multi-turn question answering in finance.
+ - **FNXL**: F1, EmAcc. Numeric labeling in financial texts.
+ - **FSRL**: F1, EmAcc. Financial statement relation linking.
+ - **EDTSUM**: ROUGE, BERTScore, BARTScore. Extractive document summarization in finance.
+ - **ECTSUM**: ROUGE, BERTScore, BARTScore. Extractive content summarization.
+ - **BigData22**: Accuracy, MCC. Stock movement prediction.
+ - **ACL18**: Accuracy, MCC. Financial news-based stock prediction.
+ - **CIKM18**: Accuracy, MCC. Financial market prediction using news.
+ - **German**: F1, MCC. Credit scoring in the German market.
+ - **Australian**: F1, MCC. Credit scoring in the Australian market.
+ - **LendingClub**: F1, MCC. Peer-to-peer lending risk prediction.
+ - **ccf**: F1, MCC. Credit card fraud detection.
+ - **ccfraud**: F1, MCC. Credit card transaction fraud detection.
+ - **polish**: F1, MCC. Credit risk prediction in the Polish market.
+ - **taiwan**: F1, MCC. Credit risk prediction in the Taiwanese market.
+ - **portoseguro**: F1, MCC. Claim analysis in the Brazilian market.
+ - **travelinsurance**: F1, MCC. Travel insurance claim prediction.
+ - **MultiFin-ES**: F1. Multi-class financial sentiment analysis in Spanish.
+ - **EFP**: F1. Financial phrase classification in Spanish.
+ - **EFPA**: F1. Financial argument classification in Spanish.
+ - **FinanceES**: F1. Financial sentiment classification in Spanish.
+ - **TSA-Spanish**: F1. Sentiment analysis in Spanish.
 
 
 To ensure a fair and unbiased assessment of the models' true capabilities, all evaluations are conducted in zero-shot settings (0-shots). This approach eliminates any potential advantage from task-specific fine-tuning, providing a clear indication of how well the models can generalize to new tasks.
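The list above is display text rendered on the leaderboard's About tab; the machine-readable task/metric mapping that `read_evals.py` iterates over is the `Tasks` enum imported from `src/display/utils.py`. A minimal sketch of what one such entry might look like, assuming the layout of the standard Hugging Face leaderboard template (the `Task` dataclass and its `benchmark`/`metric`/`col_name` fields are assumptions, not confirmed by this diff):

```python
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str  # key looked up in each result JSON (used as task.benchmark below)
    metric: str     # metric name stored under that benchmark
    col_name: str   # column label shown on the leaderboard

class Tasks(Enum):
    # Hypothetical entries mirroring two rows of the list above
    fpb = Task("FPB", "F1", "FPB")
    finqa = Task("FinQA", "EmAcc", "FinQA")
```

The new module-level `task_benchmarks` set in the diff below is built from this enum, which is why a benchmark missing from a result file can be reported by name.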
src/leaderboard/read_evals.py CHANGED
@@ -11,6 +11,7 @@ from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
 from src.submission.check_validity import is_model_on_hub
 
+task_benchmarks = {task.value.benchmark for task in Tasks}
 
 @dataclass
 class EvalResult:
@@ -38,8 +39,6 @@ class EvalResult:
         with open(json_filepath) as fp:
             data = json.load(fp)
 
-        print(f"Processing file: {json_filepath}")
-
         config = data.get("config")
         # Precision
         precision = Precision.from_str(config.get("model_dtype"))
@@ -83,7 +82,11 @@ class EvalResult:
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
 
-        print(f"Model: {model}, Org: {org}, Results: {results.keys()}")
+        # Print missing benchmarks if any
+        missing_benchmarks = task_benchmarks - results.keys()
+        if missing_benchmarks:
+            print(f"(Missing results) Model {model} is missing {', '.join(missing_benchmarks)} from result files")
+
 
         return self(
             eval_name=result_key,
@@ -102,7 +105,6 @@ class EvalResult:
     def update_with_request_file(self, requests_path):
        """Finds the relevant request file for the current model and updates info with it"""
        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
        try:
            with open(request_file, "r") as f:
                request = json.load(f)
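The missing-benchmark warning added above relies on the fact that a `dict.keys()` view supports set operations, so `task_benchmarks - results.keys()` yields the benchmarks with no score in the current result file. A self-contained toy illustration of that check (the benchmark names, score, and model name here are made up for the example):

```python
# Mirrors the module-level set built from the Tasks enum in read_evals.py.
task_benchmarks = {"FPB", "FinQA", "EDTSUM"}

# Mirrors the per-model results dict assembled in EvalResult.init_from_json_file;
# only one benchmark has a score in this toy result file.
results = {"FPB": 81.2}
model = "demo-model"  # placeholder model name

# Set difference between a set and a dict keys view gives the missing benchmarks.
missing_benchmarks = task_benchmarks - results.keys()
if missing_benchmarks:
    print(f"(Missing results) Model {model} is missing {', '.join(missing_benchmarks)} from result files")
```

Because the set is computed once at module import time, the per-file cost of the check is a single set difference rather than a scan over all tasks.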