Commit c0e342c (committed by pminervini)
Parent: d9f893d

update
Files changed:
- beta-cli.py (+6 -1)
- src/leaderboard/read_evals.py (+6 -0)
- src/populate.py (+1 -1)
beta-cli.py CHANGED

@@ -44,7 +44,7 @@ eval_requests: list[EvalRequest] = get_eval_requests(job_status=current_finished
 # Sort the evals by priority (first submitted first run)
 eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
 
-eval_results: list[EvalResult] = get_raw_eval_results(
+eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
 
 result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
 result_name_to_result = {r.eval_name: r for r in eval_results}

@@ -64,3 +64,8 @@ for eval_request in eval_requests:
 
         if task_name not in eval_result.results:
             print('RUN THIS ONE!', result_name, task_name)
+
+raw_data = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
+all_data_json = [v.to_dict() for v in raw_data if v.is_complete()]
+
+breakpoint()
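For context, the loop this second hunk extends cross-references eval requests against existing results and flags any (model, task) pair that still lacks a score. The loop body is not shown in this diff, so the following is only a rough reconstruction under assumed variable handling; request_to_result_name, Tasks, and the two dicts above come from the project:

# Hypothetical reconstruction of the loop surrounding the second hunk.
for eval_request in eval_requests:
    result_name = request_to_result_name(eval_request)
    if result_name in result_name_to_result:
        eval_result = result_name_to_result[result_name]
        for task in Tasks:
            task_name = task.value.benchmark
            if task_name not in eval_result.results:
                print('RUN THIS ONE!', result_name, task_name)

The appended tail then rebuilds the leaderboard payload the same way src/populate.py does, using the is_complete() filter added in src/leaderboard/read_evals.py below, and stops at breakpoint() so the filtered list can be inspected interactively.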
src/leaderboard/read_evals.py CHANGED

@@ -123,6 +123,12 @@ class EvalResult:
         except Exception:
             print(f"Could not find request file for {self.org}/{self.model}")
 
+    def is_complete(self) -> bool:
+        for task in Tasks:
+            if task.value.benchmark not in self.results:
+                return False
+        return True
+
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
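The new is_complete helper checks that every benchmark declared in the Tasks enum has an entry in self.results. A self-contained sketch of the same check, using stand-in Task/Tasks definitions (the real ones live elsewhere in the repo and enumerate all leaderboard benchmarks):

from dataclasses import dataclass, field
from enum import Enum

@dataclass(frozen=True)
class Task:
    # Stand-in for the project's Task dataclass; only `benchmark` matters here.
    benchmark: str

class Tasks(Enum):
    # Stand-in enum; the real one lists every leaderboard task.
    arc = Task("arc:challenge")
    hellaswag = Task("hellaswag")
    gsm8k = Task("gsm8k")

@dataclass
class EvalResult:
    results: dict[str, float] = field(default_factory=dict)

    def is_complete(self) -> bool:
        # Same logic as the diff: every benchmark must already have a score.
        return all(task.value.benchmark in self.results for task in Tasks)

partial = EvalResult({"arc:challenge": 31.1})
full = EvalResult({"arc:challenge": 31.1, "hellaswag": 51.4, "gsm8k": 0.99})
assert not partial.is_complete()
assert full.is_complete()

The all(...) form here is a compact equivalent of the explicit for/return-False loop in the diff; behavior is identical.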
src/populate.py CHANGED

@@ -15,7 +15,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     # EvalResult(eval_name='EleutherAI_pythia-1.3b_torch.float16', full_model='EleutherAI/pythia-1.3b', org='EleutherAI', model='pythia-1.3b', revision='34b668ff0acfe56f2d541aa46b385557ee39eb3f', results={'arc:challenge': 31.14334470989761, 'hellaswag': 51.43397729535949, 'hendrycksTest': 26.55151159544371, 'truthfulqa:mc': 39.24322830092449, 'winogrande': 57.37963693764798, 'gsm8k': 0.9855951478392722, 'drop': 4.056312919463095}, precision='torch.float16', model_type=<ModelType.PT: ModelTypeDetails(name='pretrained', symbol='🟢')>, weight_type='Original', architecture='GPTNeoXForCausalLM', license='apache-2.0', likes=7, num_params=1.312, date='2023-09-09T10:52:17Z', still_on_hub=True)
     # EvalResult and get_raw_eval_results are defined in ./src/leaderboard/read_evals.py, the results slots are not hardcoded
     raw_data = get_raw_eval_results(results_path, requests_path)
-    all_data_json = [v.to_dict() for v in raw_data]
+    all_data_json = [v.to_dict() for v in raw_data if v.is_complete()]
     # all_data_json.append(baseline_row)
     filter_models(all_data_json)
 
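This one-line change in populate.py is the user-facing effect: incomplete results no longer reach the dataframe. It matters because to_dict computes the average with len(Tasks) as the denominator, so a model missing even one benchmark would previously have been displayed with a deflated score. A quick illustration with made-up numbers, assuming seven tasks as in the EvalResult example above:

# Hypothetical scores: six benchmarks present, one task missing out of seven.
scores = [31.1, 51.4, 26.6, 39.2, 57.4, 1.0]
num_tasks = 7  # len(Tasks) in the real code
print(sum(scores) / num_tasks)    # ~29.5 -- deflated by the missing task
print(sum(scores) / len(scores))  # ~34.5 -- mean over tasks actually run

Filtering with is_complete() sidesteps the ambiguity by only displaying rows where both denominators agree.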