djstrong committed
Commit b521fe9
1 Parent(s): 11f89d3

sort results by date

Files changed (2)
  1. src/about.py +23 -22
  2. src/leaderboard/read_evals.py +68 -37
src/about.py CHANGED
@@ -7,35 +7,36 @@ class Task:
     metric: str
     col_name: str
     type: str
+    baseline: float = 0.0
 
 
 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    # task2 = Task("belebele_pol_Latn", "acc,none", "belebele_pol_Latn", "multiple_choice")
-    task3 = Task("polemo2_in", "exact_match,score-first", "polemo2-in_g", "generate_until")
-    task4 = Task("polemo2_in_multiple_choice", "acc,none", "polemo2-in_mc", "multiple_choice")
-    task5 = Task("polemo2_out", "exact_match,score-first", "polemo2-out_g", "generate_until")
-    task6 = Task("polemo2_out_multiple_choice", "acc,none", "polemo2-out_mc", "multiple_choice")
-    task7 = Task("polish_8tags_multiple_choice", "acc,none", "8tags_mc", "multiple_choice")
-    task8 = Task("polish_8tags_regex", "exact_match,score-first", "8tags_g", "generate_until")
-    task9a = Task("polish_belebele_mc", "acc,none", "belebele_mc", "multiple_choice")
-    task9 = Task("polish_belebele_regex", "exact_match,score-first", "belebele_g", "generate_until")
-    task10 = Task("polish_dyk_multiple_choice", "f1,none", "dyk_mc", "multiple_choice")
-    task11 = Task("polish_dyk_regex", "f1,score-first", "dyk_g", "generate_until")
-    task12 = Task("polish_ppc_multiple_choice", "acc,none", "ppc_mc", "multiple_choice")
-    task13 = Task("polish_ppc_regex", "exact_match,score-first", "ppc_g", "generate_until")
-    task14 = Task("polish_psc_multiple_choice", "f1,none", "psc_mc", "multiple_choice")
-    task15 = Task("polish_psc_regex", "f1,score-first", "psc_g", "generate_until")
-    task16 = Task("polish_cbd_multiple_choice", "f1,none", "cbd_mc", "multiple_choice")
-    task17 = Task("polish_cbd_regex", "f1,score-first", "cbd_g", "generate_until")
-    task18 = Task("polish_klej_ner_multiple_choice", "acc,none", "klej_ner_mc", "multiple_choice")
-    task19 = Task("polish_klej_ner_regex", "exact_match,score-first", "klej_ner_g", "generate_until")
+    # task2 = Task("belebele_pol_Latn", "acc,none", "belebele_pol_Latn", "multiple_choice", 0.279)
+    task3 = Task("polemo2_in", "exact_match,score-first", "polemo2-in_g", "generate_until", 0.416)
+    task4 = Task("polemo2_in_multiple_choice", "acc,none", "polemo2-in_mc", "multiple_choice", 0.416)
+    task5 = Task("polemo2_out", "exact_match,score-first", "polemo2-out_g", "generate_until", 0.368)
+    task6 = Task("polemo2_out_multiple_choice", "acc,none", "polemo2-out_mc", "multiple_choice", 0.368)
+    task7 = Task("polish_8tags_multiple_choice", "acc,none", "8tags_mc", "multiple_choice", 0.143)
+    task8 = Task("polish_8tags_regex", "exact_match,score-first", "8tags_g", "generate_until", 0.143)
+    task9a = Task("polish_belebele_mc", "acc,none", "belebele_mc", "multiple_choice", 0.279)
+    task9 = Task("polish_belebele_regex", "exact_match,score-first", "belebele_g", "generate_until", 0.279)
+    task10 = Task("polish_dyk_multiple_choice", "f1,none", "dyk_mc", "multiple_choice", 0.289)
+    task11 = Task("polish_dyk_regex", "f1,score-first", "dyk_g", "generate_until", 0.289)
+    task12 = Task("polish_ppc_multiple_choice", "acc,none", "ppc_mc", "multiple_choice", 0.419)
+    task13 = Task("polish_ppc_regex", "exact_match,score-first", "ppc_g", "generate_until", 0.419)
+    task14 = Task("polish_psc_multiple_choice", "f1,none", "psc_mc", "multiple_choice", 0.466)
+    task15 = Task("polish_psc_regex", "f1,score-first", "psc_g", "generate_until", 0.466)
+    task16 = Task("polish_cbd_multiple_choice", "f1,none", "cbd_mc", "multiple_choice", 0.149)
+    task17 = Task("polish_cbd_regex", "f1,score-first", "cbd_g", "generate_until", 0.149)
+    task18 = Task("polish_klej_ner_multiple_choice", "acc,none", "klej_ner_mc", "multiple_choice", 0.343)
+    task19 = Task("polish_klej_ner_regex", "exact_match,score-first", "klej_ner_g", "generate_until", 0.343)
     task20 = Task("polish_poleval2018_task3_test_10k", "word_perplexity,none", "poleval2018_task3_test_10k", "other")
-    task21 = Task("polish_polqa_reranking_multiple_choice", "acc,none", "polqa_reranking_mc", "other") # multiple_choice
-    task22 = Task("polish_polqa_open_book", "levenshtein,none", "polqa_open_book_g", "other") # generate_until
-    task23 = Task("polish_polqa_closed_book", "levenshtein,none", "polqa_closed_book_g", "other") # generate_until
+    # task21 = Task("polish_polqa_reranking_multiple_choice", "acc,none", "polqa_reranking_mc", "multiple_choice", 0.5335588952710677) # multiple_choice
+    # task22 = Task("polish_polqa_open_book", "levenshtein,none", "polqa_open_book_g", "generate_until", 0.0) # generate_until
+    # task23 = Task("polish_polqa_closed_book", "levenshtein,none", "polqa_closed_book_g", "generate_until", 0.0) # generate_until
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
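
The new `baseline` field holds each task's baseline score as a fraction; `read_evals.py` scales it by 100 and carries a commented-out variant of the averages that rescales every score against it. A minimal sketch of that rescaling, for illustration only (the `normalize` helper is hypothetical and not part of the repository):

def normalize(score: float, baseline: float) -> float:
    # Map a score from the [baseline, 100] range onto [0, 100]; both values are percentages.
    # Mirrors the commented-out expression (v - baselines.get(task, 0)) / (100 - baselines.get(task, 0)) * 100.
    return (score - baseline) / (100 - baseline) * 100

# Example: polemo2-in_g has baseline 0.416, i.e. 41.6 after the *100 scaling,
# so normalize(41.6, 41.6) == 0.0 and normalize(100.0, 41.6) == 100.0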
src/leaderboard/read_evals.py CHANGED
@@ -14,26 +14,28 @@ from src.submission.check_validity import is_model_on_hub
 
 NUM_FEWSHOT = 0
 
+
 @dataclass
 class EvalResult:
-    eval_name: str # org_model_precision (uid)
-    full_model: str # org/model (path on hub)
-    org: str
+    eval_name: str # org_model_precision (uid)
+    full_model: str # org/model (path on hub)
+    org: str
     model: str
-    revision: str # commit hash, "" if main
+    revision: str # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original # Original or Adapter
-    architecture: str = "Unknown"
+    model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
+    weight_type: WeightType = WeightType.Original # Original or Adapter
+    architecture: str = "Unknown"
     license: str = "?"
     lang: str = "?"
     likes: int = 0
     num_params: int = 0
-    date: str = "" # submission date of request file
+    date: str = "" # submission date of request file
     still_on_hub: bool = False
     n_shot: NShotType = NShotType.n0
     org_and_model: str = ""
+    start_date: float = 0
 
     @classmethod
     def init_from_json_file(self, json_filepath, n_shot_num):
@@ -43,6 +45,7 @@ class EvalResult:
 
         config = data.get("config")
         n_shot = data.get("n-shot")
+        start_date = data.get("date", 0)
 
         # Precision
         precision = Precision.from_str(config.get("model_dtype"))
@@ -54,14 +57,17 @@ class EvalResult:
 
         if re.match(r"^pretrained=(.*/(plgkwrobel|plggspkl)/)(models/)?", org_and_model):
             org_and_model = re.sub(r"^pretrained=(.*/(plgkwrobel|plggspkl)/)(models/)?", SPICHLERZ_ORG, org_and_model)
-        org_and_model = org_and_model.replace(",dtype=bfloat16", "")
 
-        org_and_model=org_and_model.replace("models/hf_v7_e1", "APT3-1B-Instruct-e1")
-        org_and_model=org_and_model.replace("models/hf_v7_e2", "APT3-1B-Instruct-e2")
+        org_and_model = org_and_model.replace(",dtype=bfloat16", "")
+        org_and_model = org_and_model.replace(",dtype=float16", "")
+
+        org_and_model = org_and_model.replace("models/hf_v7_e1", "APT3-1B-Instruct-e1")
+        org_and_model = org_and_model.replace("models/hf_v7_e2", "APT3-1B-Instruct-e2")
 
         org_and_model = re.sub(r"^pretrained=", "", org_and_model)
         org_and_model = org_and_model.replace(",trust_remote_code=True", "")
         org_and_model = re.sub(",prefix_token_id=\d+", "", org_and_model)
+        org_and_model = re.sub("/$", "", org_and_model)
 
         org_and_model = org_and_model.split("/", 1)
 
@@ -90,7 +96,8 @@ class EvalResult:
             task = task.value
 
             # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k and n_shot.get(k, -1) == n_shot_num])
+            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if
+                             task.benchmark == k and n_shot.get(k, -1) == n_shot_num])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
@@ -98,7 +105,8 @@ class EvalResult:
                 mean_acc = np.mean(accs)
             else:
                 mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
+            results[task.benchmark] = (mean_acc, start_date)
+            # results[task.benchmark] = mean_acc
 
         return self(
            eval_name=result_key,
@@ -106,27 +114,27 @@ class EvalResult:
             org=org,
             model=model,
             results=results,
-            precision=precision,
-            revision= config.get("model_sha", ""),
+            precision=precision,
+            revision=config.get("model_sha", ""),
             still_on_hub=still_on_hub,
             architecture=architecture,
             n_shot=NShotType.from_str(n_shot_num),
-            org_and_model=orig_org_and_model
+            org_and_model=orig_org_and_model,
+            start_date=start_date
         )
 
     def update_with_metadata(self, metadata):
-        #print('UPDATE', self.full_model, self.model, self.eval_name)
+        # print('UPDATE', self.full_model, self.model, self.eval_name)
         try:
-            meta=metadata[self.full_model]
+            meta = metadata[self.full_model]
             self.model_type = ModelType.from_str(meta.get("type", "?"))
             self.num_params = meta.get("params", 0)
             self.license = meta.get("license", "?")
             self.lang = meta.get("lang", "?")
-            #TODO desc name
+            # TODO desc name
         except KeyError:
             print(f"Could not find metadata for {self.full_model}")
 
-
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
         return
@@ -149,12 +157,18 @@ class EvalResult:
         g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
         mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
         all_tasks = g_tasks + mc_tasks
-        average = sum([v for task,v in self.results.items() if v is not None and task in all_tasks]) / len(all_tasks)
-        average_g = sum([v for task,v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
-        average_mc = sum([v for task,v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
 
+        baselines = {task.value.benchmark: task.value.baseline*100 for task in Tasks}
 
-        data_dict={}
+        average = sum([v for task, v in self.results.items() if v is not None and task in all_tasks]) / len(all_tasks)
+        average_g = sum([v for task, v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
+        average_mc = sum([v for task, v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
+
+        # average = sum([(v-baselines.get(task,0))/(100-baselines.get(task,0))*100 for task, v in self.results.items() if v is not None and task in all_tasks]) / len(all_tasks)
+        # average_g = sum([(v-baselines.get(task,0))/(100-baselines.get(task,0))*100 for task, v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
+        # average_mc = sum([(v-baselines.get(task,0))/(100-baselines.get(task,0))*100 for task, v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
+
+        data_dict = {}
         # data_dict = {
         # "eval_name": self.eval_name, # not a column, just a save name,
         # AutoEvalColumn.precision.name: self.precision.value.name,
@@ -188,7 +202,6 @@ class EvalResult:
         except KeyError:
             print(f"Could not find model type")
 
-
         try:
             data_dict[AutoEvalColumn.model_type_symbol.name] = self.model_type.value.symbol
         except KeyError:
@@ -209,7 +222,8 @@ class EvalResult:
             print(f"AttributeError architecture")
 
         try:
-            data_dict[AutoEvalColumn.model.name] = make_clickable_model(self.full_model) if self.still_on_hub else self.full_model
+            data_dict[AutoEvalColumn.model.name] = make_clickable_model(
+                self.full_model) if self.still_on_hub else self.full_model
         except KeyError:
             print(f"Could not find model")
 
@@ -305,8 +319,8 @@ def get_request_file_for_model(requests_path, model_name, precision):
         with open(tmp_request_file, "r") as f:
             req_content = json.load(f)
         if (
-            req_content["status"] in ["FINISHED"]
-            and req_content["precision"] == precision.split(".")[-1]
+            req_content["status"] in ["FINISHED"]
+            and req_content["precision"] == precision.split(".")[-1]
         ):
             request_file = tmp_request_file
     return request_file
@@ -330,30 +344,48 @@ def get_raw_eval_results(results_path: str, requests_path: str, metadata) -> lis
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))
 
+    # print('PATHS:', model_result_filepaths)
+
     eval_results = {}
-    for n_shot in [0,5]:
+    for n_shot in [0, 5]:
         for model_result_filepath in model_result_filepaths:
             # Creation of result
             eval_result = EvalResult.init_from_json_file(model_result_filepath, n_shot_num=n_shot)
             eval_result.update_with_request_file(requests_path)
-            #update with metadata
+            # update with metadata
            eval_result.update_with_metadata(metadata)
 
-
             # Store results of same eval together
             eval_name = f"{eval_result.eval_name}_{n_shot}-shot"
             if eval_name in eval_results.keys():
-                eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-                #TODO: log updated
+
+                for k, (v, start_date) in eval_result.results.items():
+                    if v is not None:
+                        if k in eval_results[eval_name].results:
+                            if start_date > eval_results[eval_name].results[k][1]:
+                                print(
+                                    f"Overwriting {eval_name}.results {k} {eval_results[eval_name].results[k]} with {v}: {model_result_filepath} {n_shot} {eval_result.start_date} {eval_results[eval_name].start_date}")
+                                eval_results[eval_name].results[k] = (v, start_date)
+                            else:
+                                print(
+                                    f"Skipping {eval_name} {eval_result.start_date} {eval_results[eval_name].start_date}: {model_result_filepath} {n_shot}")
+                        else:
+                            eval_results[eval_name].results[k] = (v, start_date)
+                # eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+                # TODO: log updated
+
             else:
                 eval_results[eval_name] = eval_result
 
+    for k,v in eval_results.items():
+        v.results = {k: v for k, (v, start_date) in v.results.items()}
+
     results = []
     for v in eval_results.values():
         try:
             print(v)
-            v.to_dict() # we test if the dict version is complete
-            #if v.results:
+            v.to_dict() # we test if the dict version is complete
+            # if v.results:
             results.append(v)
         except KeyError: # not all eval values present
             print(f"not all eval values present {v.eval_name} {v.full_model}")
@@ -370,7 +402,7 @@ def get_raw_eval_results(results_path: str, requests_path: str, metadata) -> lis
                 missing_results_for_task[task_name].append(f"{v.full_model}|{v.org_and_model}")
             else:
                 missing_results_for_task[task_name] = [f"{v.full_model}|{v.org_and_model}"]
-        if r[AutoEvalColumn.lang.name] is None or r[AutoEvalColumn.lang.name]=="?":
+        if r[AutoEvalColumn.lang.name] is None or r[AutoEvalColumn.lang.name] == "?":
             missing_metadata.append(f"{v.full_model}")
 
     # print('missing_results_for_task', missing_results_for_task)
@@ -386,5 +418,4 @@ def get_raw_eval_results(results_path: str, requests_path: str, metadata) -> lis
         print(model)
         print()
 
-
     return results
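
Taken together, the changes above store each task score as a (score, start_date) tuple, where start_date comes from the `date` field of the results JSON; when the same eval appears in several result files, the score from the newest run wins, and the tuples are flattened back to plain scores afterwards. A condensed sketch of that merge step, with hypothetical names (the real code mutates EvalResult.results in place inside get_raw_eval_results):

def merge_newest(existing: dict, incoming: dict) -> None:
    # Each value is a (score, start_date) tuple; a newer start_date overwrites an older one,
    # an older one is skipped, and tasks not seen before are added as-is.
    for task, (score, start_date) in incoming.items():
        if score is None:
            continue
        if task not in existing or start_date > existing[task][1]:
            existing[task] = (score, start_date)

# After all files are merged, the tuples are flattened back to plain scores:
# results = {task: score for task, (score, _) in results.items()}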