eduagarcia committed
Commit 8aaf0e7 • 1 Parent(s): 0cc3edb
Add env variable SHOW_INCOMPLETE_EVALS and order evaluation queue by priority
Files changed:
- app.py +16 -3
- src/display/changelog.py +1 -1
- src/envs.py +2 -0
- src/leaderboard/read_evals.py +3 -3
- src/populate.py +19 -6
- tasks_config/pt_config.yaml +1 -0
app.py
CHANGED
@@ -30,7 +30,19 @@ from src.display.utils import (
     WeightType,
     Precision
 )
-from src.envs import API, EVAL_REQUESTS_PATH, DYNAMIC_INFO_REPO, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
+from src.envs import (
+    API,
+    EVAL_REQUESTS_PATH,
+    DYNAMIC_INFO_REPO,
+    DYNAMIC_INFO_FILE_PATH,
+    DYNAMIC_INFO_PATH,
+    EVAL_RESULTS_PATH,
+    H4_TOKEN, IS_PUBLIC,
+    QUEUE_REPO,
+    REPO_ID,
+    RESULTS_REPO,
+    SHOW_INCOMPLETE_EVALS
+)
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 from src.scripts.update_all_request_files import update_dynamic_files
@@ -81,7 +93,8 @@ def init_space():
         requests_path=EVAL_REQUESTS_PATH,
         dynamic_path=DYNAMIC_INFO_FILE_PATH,
         cols=COLS,
-        benchmark_cols=BENCHMARK_COLS
+        benchmark_cols=BENCHMARK_COLS,
+        show_incomplete=SHOW_INCOMPLETE_EVALS
     )
     update_collections(original_df.copy())
     leaderboard_df = original_df.copy()
@@ -93,7 +106,7 @@ def init_space():
         running_eval_queue_df,
         pending_eval_queue_df,
         failed_eval_queue_df
-    ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+    ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS, show_incomplete=SHOW_INCOMPLETE_EVALS)
 
     return leaderboard_df, original_df, plot_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df, failed_eval_queue_df
 
src/display/changelog.py
CHANGED
@@ -2,7 +2,7 @@ CHANGELOG_TEXT = f"""
 # Changes made to the leaderboard
 
 ### [1.1.0] - 2024-02-16
-Removed the Sparrow POR benchmark from the leaderboard because of low quality annotations
+Removed the Sparrow POR benchmark from the leaderboard because of low quality annotations
 Added HateBR Offensive, PT Hate Speech and tweetSentBR benchmarks to the leaderboard, started new evaluation queue for these benchmarks
 
 ### [1.0.0] - 2024-02-01
src/envs.py
CHANGED
@@ -65,4 +65,6 @@ GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS = str2bool(get_config("GET_ORIGINAL_HF_
 ORIGINAL_HF_LEADERBOARD_RESULTS_REPO = get_config("ORIGINAL_HF_LEADERBOARD_RESULTS_REPO", "open-llm-leaderboard/results")
 ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, 'original_results')
 
+SHOW_INCOMPLETE_EVALS = str2bool(get_config("SHOW_INCOMPLETE_EVALS", False))
+
 API = HfApi(token=H4_TOKEN)
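For context, a minimal sketch of how the new SHOW_INCOMPLETE_EVALS line resolves. str2bool and get_config are defined elsewhere in src/envs.py and are not touched by this commit, so the helper bodies below are assumptions rather than the repo's actual implementations:

import os

def get_config(name, default):
    # Assumed behavior: read the key from the environment, fall back to the default.
    return os.environ.get(name, default)

def str2bool(value):
    # Assumed behavior: pass booleans through and parse common truthy strings.
    if isinstance(value, bool):
        return value
    return str(value).strip().lower() in ("1", "true", "yes", "y", "t")

# Mirrors the added line: defaults to False unless the SHOW_INCOMPLETE_EVALS setting is switched on.
SHOW_INCOMPLETE_EVALS = str2bool(get_config("SHOW_INCOMPLETE_EVALS", False))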
src/leaderboard/read_evals.py
CHANGED
@@ -12,7 +12,7 @@ from huggingface_hub import ModelCard
 
 from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, ORIGINAL_TASKS
-from src.envs import GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS
+from src.envs import GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS, SHOW_INCOMPLETE_EVALS
 
 @dataclass
 class EvalResult:
@@ -216,7 +216,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
         with open(tmp_request_file, "r") as f:
             req_content = json.load(f)
             if (
-                req_content["status"] in ["FINISHED", "PENDING_NEW_EVAL"]
+                req_content["status"] in ["FINISHED", "PENDING_NEW_EVAL" if SHOW_INCOMPLETE_EVALS else "FINISHED"]
                 and req_content["precision"] == precision.split(".")[-1]
             ):
                 request_file = tmp_request_file
@@ -262,7 +262,7 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
     results = []
     for v in eval_results.values():
         try:
-            if v.status in ["FINISHED", "PENDING_NEW_EVAL"] and not v.hidden:
+            if v.status in ["FINISHED", "PENDING_NEW_EVAL" if SHOW_INCOMPLETE_EVALS else "FINISHED"] and not v.hidden:
                 v.to_dict() # we test if the dict version is complete
                 results.append(v)
         except KeyError as e: # not all eval values present
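A note on the ["FINISHED", "PENDING_NEW_EVAL" if SHOW_INCOMPLETE_EVALS else "FINISHED"] pattern in both hunks above: the conditional expression only selects the second list element, so with the flag off the list collapses to ["FINISHED", "FINISHED"] and only finished evals pass the membership test. A small standalone sketch (the function name is illustrative, not from the repo):

def allowed_statuses(show_incomplete: bool) -> list[str]:
    # Same list construction as in read_evals.py, isolated for clarity.
    return ["FINISHED", "PENDING_NEW_EVAL" if show_incomplete else "FINISHED"]

assert "PENDING_NEW_EVAL" in allowed_statuses(True)
assert "PENDING_NEW_EVAL" not in allowed_statuses(False)  # list is ["FINISHED", "FINISHED"]
assert "FINISHED" in allowed_statuses(False)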
src/populate.py
CHANGED
@@ -1,5 +1,6 @@
 import json
 import os
+import copy
 
 import pandas as pd
 
@@ -9,7 +10,7 @@ from src.leaderboard.filter_models import filter_models_flags
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
-def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str, cols: list, benchmark_cols: list, show_incomplete=False) -> pd.DataFrame:
     raw_data = get_raw_eval_results(results_path=results_path, requests_path=requests_path, dynamic_path=dynamic_path)
     all_data_json = [v.to_dict() for v in raw_data]
     all_data_json.append(baseline_row)
@@ -21,11 +22,12 @@ def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str,
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
-    df = df[has_no_nan_values(df, benchmark_cols)]
+    if not show_incomplete:
+        df = df[has_no_nan_values(df, benchmark_cols)]
     return raw_data, df
 
 
-def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
+def get_evaluation_queue_df(save_path: str, cols: list, show_incomplete=False) -> list[pd.DataFrame]:
     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
     all_evals = []
 
@@ -51,12 +53,23 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
                 data[EvalQueueColumn.revision.name] = data.get("revision", "main")
                 all_evals.append(data)
 
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
+    cols_pending = copy.deepcopy(cols)
+    cols_pending.append('source')
+    cols_pending.append('submitted_time')
+
+    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN", "PENDING_NEW_EVAL"]]
     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"]
+    finished_list = [e for e in all_evals if e["status"] in ["FINISHED", "PENDING_NEW_EVAL" if show_incomplete else "FINISHED"]]
     failed_list = [e for e in all_evals if e["status"] == "FAILED"]
-    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
+    df_pending = pd.DataFrame.from_records(pending_list, columns=cols_pending)
     df_running = pd.DataFrame.from_records(running_list, columns=cols)
     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
     df_failed = pd.DataFrame.from_records(failed_list, columns=cols)
+
+    df_pending['source_priority'] = df_pending["source"].apply(lambda x: {"manual": 0, "leaderboard": 1, "script": 2}.get(x, 3))
+    df_pending['status_priority'] = df_pending["status"].apply(lambda x: {"PENDING": 2, "RERUN": 0, "PENDING_NEW_EVAL": 1}.get(x, 3))
+
+    df_pending = df_pending.sort_values(['source_priority', 'status_priority', 'submitted_time'])
+    df_pending = df_pending.drop(['source_priority', 'status_priority', 'submitted_time', 'source'], axis=1)
+
     return df_finished[cols], df_running[cols], df_pending[cols], df_failed[cols]
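For illustration, a self-contained sketch of the new pending-queue ordering: manual submissions sort ahead of leaderboard-initiated ones, which sort ahead of script-initiated ones; within a source, RERUN outranks PENDING_NEW_EVAL, which outranks PENDING; ties break on submitted_time. The rows below are invented purely for the example:

import pandas as pd

# Toy pending queue; the data is made up to demonstrate the sort order only.
df_pending = pd.DataFrame([
    {"model": "a", "status": "PENDING",          "source": "script",      "submitted_time": "2024-02-01"},
    {"model": "b", "status": "RERUN",            "source": "leaderboard", "submitted_time": "2024-02-03"},
    {"model": "c", "status": "PENDING_NEW_EVAL", "source": "manual",      "submitted_time": "2024-02-02"},
    {"model": "d", "status": "PENDING",          "source": "manual",      "submitted_time": "2024-02-01"},
])

# Same priority maps as the commit: lower numbers come first, unknown values last.
df_pending["source_priority"] = df_pending["source"].apply(lambda x: {"manual": 0, "leaderboard": 1, "script": 2}.get(x, 3))
df_pending["status_priority"] = df_pending["status"].apply(lambda x: {"PENDING": 2, "RERUN": 0, "PENDING_NEW_EVAL": 1}.get(x, 3))

df_pending = df_pending.sort_values(["source_priority", "status_priority", "submitted_time"])
print(df_pending["model"].tolist())  # ['c', 'd', 'b', 'a']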
tasks_config/pt_config.yaml
CHANGED
@@ -10,6 +10,7 @@ config:
   LEADERBOARD_NAME: "Open PT-LLM Leaderboard"
   GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS: true
   TRUST_REMOTE_CODE: true
+  SHOW_INCOMPLETE_EVALS: false
 readme:
   general_description: |
     The Open PT LLM Leaderboard aims to provide a benchmark for the evaluation of