|
import glob
import json
import os
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import numpy as np

from src.utils_display import AutoEvalColumn, make_clickable_model
|
|
|
# Metric read from each matching task entry of a results file.
# Positionally paired with BENCHMARKS: METRICS[i] is the metric for BENCHMARKS[i].
METRICS = ["acc_norm", "acc_norm", "acc", "mc2"]

# Benchmark identifiers; a task in a results file belongs to a benchmark when
# the identifier occurs as a substring of the task name.
BENCHMARKS = ["arc:challenge", "hellaswag", "hendrycksTest", "truthfulqa:mc"]

# Maps every benchmark identifier to the name of the leaderboard column that
# holds its score (same order as BENCHMARKS).
BENCH_TO_NAME = dict(
    zip(
        BENCHMARKS,
        [
            AutoEvalColumn.arc.name,
            AutoEvalColumn.hellaswag.name,
            AutoEvalColumn.mmlu.name,
            AutoEvalColumn.truthfulqa.name,
        ],
    )
)
|
|
|
|
|
@dataclass
class EvalResult:
    """Benchmark scores collected for one evaluated model revision.

    ``results`` maps benchmark identifiers (see ``BENCHMARKS``) to a score.
    A single instance may hold only a subset of the benchmarks.
    """

    eval_name: str  # key identifying the evaluated model/revision
    org: str  # None when the model id has no "org/" prefix
    model: str
    revision: str
    results: dict
    precision: str = "16bit"
    model_type: str = ""
    weight_type: str = ""

    def to_dict(self):
        """Flatten this result into a dict keyed by leaderboard column names.

        Benchmarks without a score are reported as ``None``. The average is
        the sum of the available scores divided by the total number of
        benchmarks, rounded to one decimal, so a missing benchmark counts
        as zero.

        ``self.results`` is left untouched, which keeps the method safe to
        call repeatedly even when some benchmarks are missing.

        Returns:
            dict: ``eval_name``, ``weight_type`` and one entry per
            ``AutoEvalColumn`` column filled in below.
        """
        if self.org is not None:
            base_model = f"{self.org}/{self.model}"
        else:
            base_model = f"{self.model}"

        # Skip None so unavailable scores cannot break the sum.
        available_scores = [v for v in self.results.values() if v is not None]

        data_dict = {}
        data_dict["eval_name"] = self.eval_name
        data_dict["weight_type"] = self.weight_type
        data_dict[AutoEvalColumn.precision.name] = self.precision
        data_dict[AutoEvalColumn.model_type.name] = self.model_type
        data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
        data_dict[AutoEvalColumn.dummy.name] = base_model
        data_dict[AutoEvalColumn.revision.name] = self.revision
        data_dict[AutoEvalColumn.average.name] = round(
            sum(available_scores) / len(BENCHMARKS), 1
        )

        # .get() yields None for benchmarks that were never evaluated, without
        # writing placeholder entries back into self.results.
        for benchmark, column_name in BENCH_TO_NAME.items():
            data_dict[column_name] = self.results.get(benchmark)

        return data_dict
|
|
|
|
|
def parse_eval_result(json_filepath: str) -> Tuple[Optional[str], List["EvalResult"]]:
    """Parse one results JSON file into per-benchmark ``EvalResult`` objects.

    Args:
        json_filepath: Path to a results file. It must contain a ``"config"``
            (or ``"config_general"``) section and, unless the file is
            skipped, a ``"results"`` section; a ``"versions"`` section is
            optional.

    Returns:
        A ``(result_key, eval_results)`` tuple. ``result_key`` combines the
        org (if any), model name, model sha and eval sha. ``eval_results``
        holds one ``EvalResult`` per benchmark found in the file, each
        carrying a single score: the mean of the benchmark's metric over all
        matching tasks, times 100, rounded to one decimal.
        ``(None, [])`` is returned when the file is skipped, i.e. when its
        MMLU abstract-algebra task is at version 0 or when the config names
        no model.

    Raises:
        KeyError: If neither config section exists, or a matching task entry
            lacks the expected metric.
    """
    with open(json_filepath, encoding="utf-8") as fp:
        data = json.load(fp)

    # Files whose MMLU task is at version 0 are ignored entirely.
    # NOTE(review): presumably an outdated task implementation - confirm.
    versions = data.get("versions", {})
    for mmlu_k in ["harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra"]:
        if versions.get(mmlu_k) == 0:
            return None, []

    # Two config layouts are supported; "config" takes precedence.
    try:
        config = data["config"]
    except KeyError:
        config = data["config_general"]

    model = config.get("model_name", None)
    if model is None:
        model = config.get("model_args", None)
    if model is None:
        # Without a model id there is nothing to key the results on; skip the
        # file instead of failing on None below.
        return None, []

    model_sha = config.get("model_sha", "")
    eval_sha = config.get("lighteval_sha", "")

    # Split "org/name" on the first slash only; ids without a slash have no org.
    head, separator, tail = model.partition("/")
    if separator:
        org, model = head, tail
        result_key = f"{org}_{model}_{model_sha}_{eval_sha}"
    else:
        org = None
        result_key = f"{model}_{model_sha}_{eval_sha}"

    eval_results = []
    for benchmark, metric in zip(BENCHMARKS, METRICS):
        # A benchmark may be spread over several tasks (substring match on the
        # task name); their metric values are averaged.
        accs = np.array([v[metric] for k, v in data["results"].items() if benchmark in k])
        if accs.size == 0:
            continue
        mean_acc = round(np.mean(accs) * 100.0, 1)
        eval_results.append(EvalResult(
            eval_name=result_key, org=org, model=model, revision=model_sha, results={benchmark: mean_acc},
        ))

    return result_key, eval_results
|
|
|
|
|
def get_eval_results(is_public) -> List[EvalResult]:
    """Collect and merge the evaluation results found on disk.

    Result files are searched recursively under ``eval-results`` and, when
    ``is_public`` is falsy, also under ``private-eval-results``. Entries
    sharing the same result key are merged into one ``EvalResult`` whose
    ``results`` dict accumulates the scores of all of them.

    Args:
        is_public: When truthy, only the public results directory is read.

    Returns:
        One ``EvalResult`` per distinct result key, in first-seen order.
    """
    patterns = ["eval-results/**/results*.json"]
    if not is_public:
        patterns.append("private-eval-results/**/results*.json")

    paths = []
    for pattern in patterns:
        paths.extend(glob.glob(pattern, recursive=True))

    merged = {}
    for path in paths:
        key, parsed = parse_eval_result(path)
        for entry in parsed:
            if key not in merged:
                # First entry for this key becomes the accumulator.
                merged[key] = entry
                continue
            merged[key].results.update(entry.results)

    return list(merged.values())
|
|
|
|
|
def get_eval_results_dicts(is_public=True) -> List[Dict]:
    """Return the merged evaluation results as dicts.

    Args:
        is_public: Forwarded to ``get_eval_results``.

    Returns:
        The ``to_dict()`` output of every result, in the order produced by
        ``get_eval_results``.
    """
    rows = []
    for result in get_eval_results(is_public):
        rows.append(result.to_dict())
    return rows
|
|