import json
import os
from dataclasses import dataclass

from src.display.formatting import make_clickable_model
from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType


@dataclass
class EvalResult:
    """Represents one full evaluation.

    Built from a combination of the result and request file for a given run.
    """

    eval_name: str  # org_model_precision (uid)
    full_model: str  # org/model (path on hub)
    org: str
    model: str
    revision: str  # commit hash, "" if main
    results: dict
    average_accuracy: float
    precision: Precision = Precision.Unknown
    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
    weight_type: WeightType = WeightType.Original  # Original or Adapter
    architecture: str = "Unknown"
    license: str = "?"
    likes: int = 0
    num_params: int = 0
    date: str = ""  # submission date of request file
    still_on_hub: bool = False

    @classmethod
    def init_from_json_file(cls, json_filepath):
        """Inits the result from the specific model result file."""
        with open(json_filepath) as fp:
            data = json.load(fp)

        config = data.get("config", {})

        # Precision
        precision = Precision.from_str(config.get("model_dtype", "Unknown"))

        # Get model and org
        org_and_model = config.get("model_name", "").split("/", 1)
        if len(org_and_model) == 1:
            org = None
            model = org_and_model[0]
            result_key = f"{model}_{precision.value.name}"
        else:
            org = org_and_model[0]
            model = org_and_model[1]
            result_key = f"{org}_{model}_{precision.value.name}"
        full_model = "/".join(org_and_model)

        results_data = data.get("results", {})

        # Extract per-subject accuracies; subjects absent from the file are skipped
        per_subject_results = {}
        for task in Tasks:
            subject = task.value.benchmark
            accuracy = results_data.get(subject, None)
            if accuracy is not None:
                per_subject_results[subject] = accuracy

        average_accuracy = results_data.get("average", None)

        # Set other fields from config
        model_type = ModelType.from_str(config.get("model_type", ""))
        weight_type = WeightType[config.get("weight_type", "Original")]
        license = config.get("license", "?")
        likes = config.get("likes", 0)
        num_params = config.get("params", 0)
        date = config.get("submitted_time", "")
        still_on_hub = config.get("still_on_hub", True)
        architecture = config.get("architecture", "Unknown")

        # Create EvalResult instance
        return cls(
            eval_name=result_key,
            full_model=full_model,
            org=org,
            model=model,
            results=per_subject_results,
            average_accuracy=average_accuracy,
            precision=precision,
            revision=config.get("model_sha", ""),
            still_on_hub=still_on_hub,
            architecture=architecture,
            model_type=model_type,
            weight_type=weight_type,
            license=license,
            likes=likes,
            num_params=num_params,
            date=date,
        )

    def to_dict(self):
        """Converts the EvalResult to a dict compatible with our dataframe display."""
        data_dict = {
            "eval_name": self.eval_name,  # not a column, just a save name
            AutoEvalColumn.precision.name: self.precision.value.name,
            AutoEvalColumn.model_type.name: self.model_type.value.name,
            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
            AutoEvalColumn.architecture.name: self.architecture,
            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
            AutoEvalColumn.revision.name: self.revision,
            AutoEvalColumn.average.name: self.average_accuracy,
            AutoEvalColumn.license.name: self.license,
            AutoEvalColumn.likes.name: self.likes,
            AutoEvalColumn.params.name: self.num_params,
            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
        }

        # One column per subject; subjects with no recorded accuracy show as None
        for task in Tasks:
            subject = task.value.benchmark
            data_dict[task.value.col_name] = self.results.get(subject, None)

        return data_dict


def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
    """From the path of the results folder root, extract all needed info for results.

    Note: requests_path is accepted but not used by this implementation.
    """
    model_result_filepaths = []

    for root, _, files in os.walk(results_path):
        # We should only have json files in model results
        for file in files:
            if file.endswith(".json"):
                model_result_filepaths.append(os.path.join(root, file))

    eval_results = {}
    for model_result_filepath in model_result_filepaths:
        # Creation of result
        eval_result = EvalResult.init_from_json_file(model_result_filepath)

        # Store results, keyed by uid so a later file for the same run overwrites an earlier one
        eval_name = eval_result.eval_name
        eval_results[eval_name] = eval_result

    results = []
    for v in eval_results.values():
        try:
            v.to_dict()  # we test if the dict version is complete
            results.append(v)
        except KeyError:  # not all eval values present
            continue

    return results
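
# ---------------------------------------------------------------------------
# Illustrative result-file shape. This is a sketch inferred from the keys
# init_from_json_file reads above, not a schema shipped with the project:
# every concrete value below is made up, and the per-subject keys under
# "results" must match the Task.value.benchmark names defined in
# src.display.utils.
# ---------------------------------------------------------------------------
_EXAMPLE_RESULT_FILE = {
    "config": {
        "model_dtype": "torch.float16",
        "model_name": "example-org/example-model",  # hypothetical hub id
        "model_sha": "abc1234",                     # commit hash, "" if main
        "model_type": "pretrained",
        "weight_type": "Original",
        "license": "apache-2.0",
        "likes": 0,
        "params": 7,
        "submitted_time": "2024-01-01T00:00:00Z",
        "still_on_hub": True,
        "architecture": "LlamaForCausalLM",
    },
    "results": {
        "example_subject": 0.5,  # one entry per Task.value.benchmark
        "average": 0.5,
    },
}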
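
# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): how these helpers are
# typically wired into a leaderboard dataframe. The "eval-results" and
# "eval-queue" paths are hypothetical, and pandas is an assumed dependency
# used only for this example.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import pandas as pd

    raw_results = get_raw_eval_results("eval-results", "eval-queue")

    # Each EvalResult flattens into one row keyed by the display columns.
    df = pd.DataFrame([r.to_dict() for r in raw_results])
    print(df.head())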