Spaces:

open-llm-leaderboard
/

open_llm_leaderboard

Running on CPU Upgrade

App Files Files Community

1025

Alina Lozovskaia committed on May 20

Commit

d95d4a1

•

1 Parent(s): 9b7814c

apply code style and quality checks to read_evals.py

Browse files

Files changed (1) hide show

src/leaderboard/read_evals.py +27 -29

src/leaderboard/read_evals.py CHANGED Viewed

@@ -16,36 +16,36 @@ from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, parse_datetime
 # Configure logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 @dataclass
 class EvalResult:
     # Also see src.display.utils.AutoEvalColumn for what will be displayed.
-    eval_name: str # org_model_precision (uid)
-    full_model: str # org/model (path on hub)
     org: Optional[str]
     model: str
-    revision: str # commit hash, "" if main
     results: Dict[str, float]
     precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original
-    architecture: str = "Unknown" # From config file
     license: str = "?"
     likes: int = 0
     num_params: int = 0
-    date: str = "" # submission date of request file
     still_on_hub: bool = True
     is_merge: bool = False
     not_flagged: bool = False
     status: str = "FINISHED"
     # List of tags, initialized to a new empty list for each instance to avoid the pitfalls of mutable default arguments.
     tags: List[str] = field(default_factory=list)
     @classmethod
-    def init_from_json_file(cls, json_filepath: str) -> 'EvalResult':
-        with open(json_filepath, 'r') as fp:
             data = json.load(fp)
         config = data.get("config_general", {})
@@ -72,7 +72,7 @@ class EvalResult:
             model=model,
             results=results,
             precision=precision,
-            revision=config.get("model_sha", "")
         )
     @staticmethod
@@ -118,9 +118,8 @@ class EvalResult:
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
-        return results
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it."""
@@ -130,17 +129,17 @@ class EvalResult:
                 logging.warning(f"No request file for {self.org}/{self.model}")
                 self.status = "FAILED"
                 return
             with open(request_file, "r") as f:
                 request = json.load(f)
             self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
             self.weight_type = WeightType[request.get("weight_type", "Original")]
             self.num_params = int(request.get("params", 0))  # Ensuring type safety
             self.date = request.get("submitted_time", "")
             self.architecture = request.get("architectures", "Unknown")
             self.status = request.get("status", "FAILED")
         except FileNotFoundError:
             self.status = "FAILED"
             logging.error(f"Request file: {request_file} not found for {self.org}/{self.model}")
@@ -154,7 +153,6 @@ class EvalResult:
             self.status = "FAILED"
             logging.error(f"Unexpected error {e} for {self.org}/{self.model}")
     def update_with_dynamic_file_dict(self, file_dict):
         """Update object attributes based on the provided dictionary, with error handling for missing keys and type validation."""
         # Default values set for optional or potentially missing keys.
@@ -162,11 +160,10 @@ class EvalResult:
         self.likes = int(file_dict.get("likes", 0))  # Ensure likes is treated as an integer
         self.still_on_hub = file_dict.get("still_on_hub", False)  # Default to False if key is missing
         self.tags = file_dict.get("tags", [])
         # Calculate `flagged` only if 'tags' is not empty and avoid calculating each time
         self.not_flagged = not (any("flagged" in tag for tag in self.tags))
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
@@ -185,8 +182,10 @@ class EvalResult:
             AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
-            AutoEvalColumn.merged.name: not( "merge" in self.tags if self.tags else False),
-            AutoEvalColumn.moe.name: not ( ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower()) ,
             AutoEvalColumn.not_flagged.name: self.not_flagged,
         }
@@ -194,16 +193,16 @@ class EvalResult:
             data_dict[task.value.col_name] = self.results[task.value.benchmark]
         return data_dict
 def get_request_file_for_model(requests_path, model_name, precision):
     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
     requests_path = Path(requests_path)
     pattern = f"{model_name}_eval_request_*.json"
     # Using pathlib to find files matching the pattern
     request_files = list(requests_path.glob(pattern))
     # Sort the files by name in descending order to mimic 'reverse=True'
     request_files.sort(reverse=True)
@@ -214,7 +213,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
             req_content = json.load(f)
             if req_content["status"] == "FINISHED" and req_content["precision"] == precision.split(".")[-1]:
                 request_file = str(request_file)
     # Return empty string if no file found that matches criteria
     return request_file
@@ -223,9 +222,9 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
     """From the path of the results folder root, extract all needed info for results"""
     with open(dynamic_path) as f:
         dynamic_data = json.load(f)
     results_path = Path(results_path)
-    model_files = list(results_path.rglob('results_*.json'))
     model_files.sort(key=lambda file: parse_datetime(file.stem.removeprefix("results_")))
     eval_results = {}
@@ -260,4 +259,3 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
             continue
     return results

 from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, parse_datetime
 # Configure logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 @dataclass
 class EvalResult:
     # Also see src.display.utils.AutoEvalColumn for what will be displayed.
+    eval_name: str  # org_model_precision (uid)
+    full_model: str  # org/model (path on hub)
     org: Optional[str]
     model: str
+    revision: str  # commit hash, "" if main
     results: Dict[str, float]
     precision: Precision = Precision.Unknown
+    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original
+    architecture: str = "Unknown"  # From config file
     license: str = "?"
     likes: int = 0
     num_params: int = 0
+    date: str = ""  # submission date of request file
     still_on_hub: bool = True
     is_merge: bool = False
     not_flagged: bool = False
     status: str = "FINISHED"
     # List of tags, initialized to a new empty list for each instance to avoid the pitfalls of mutable default arguments.
     tags: List[str] = field(default_factory=list)
     @classmethod
+    def init_from_json_file(cls, json_filepath: str) -> "EvalResult":
+        with open(json_filepath, "r") as fp:
             data = json.load(fp)
         config = data.get("config_general", {})
             model=model,
             results=results,
             precision=precision,
+            revision=config.get("model_sha", ""),
         )
     @staticmethod
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
+        return results
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it."""
                 logging.warning(f"No request file for {self.org}/{self.model}")
                 self.status = "FAILED"
                 return
             with open(request_file, "r") as f:
                 request = json.load(f)
             self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
             self.weight_type = WeightType[request.get("weight_type", "Original")]
             self.num_params = int(request.get("params", 0))  # Ensuring type safety
             self.date = request.get("submitted_time", "")
             self.architecture = request.get("architectures", "Unknown")
             self.status = request.get("status", "FAILED")
         except FileNotFoundError:
             self.status = "FAILED"
             logging.error(f"Request file: {request_file} not found for {self.org}/{self.model}")
             self.status = "FAILED"
             logging.error(f"Unexpected error {e} for {self.org}/{self.model}")
     def update_with_dynamic_file_dict(self, file_dict):
         """Update object attributes based on the provided dictionary, with error handling for missing keys and type validation."""
         # Default values set for optional or potentially missing keys.
         self.likes = int(file_dict.get("likes", 0))  # Ensure likes is treated as an integer
         self.still_on_hub = file_dict.get("still_on_hub", False)  # Default to False if key is missing
         self.tags = file_dict.get("tags", [])
         # Calculate `flagged` only if 'tags' is not empty and avoid calculating each time
         self.not_flagged = not (any("flagged" in tag for tag in self.tags))
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
             AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            AutoEvalColumn.merged.name: not ("merge" in self.tags if self.tags else False),
+            AutoEvalColumn.moe.name: not (
+                ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower()
+            ),
             AutoEvalColumn.not_flagged.name: self.not_flagged,
         }
             data_dict[task.value.col_name] = self.results[task.value.benchmark]
         return data_dict
 def get_request_file_for_model(requests_path, model_name, precision):
     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
     requests_path = Path(requests_path)
     pattern = f"{model_name}_eval_request_*.json"
     # Using pathlib to find files matching the pattern
     request_files = list(requests_path.glob(pattern))
     # Sort the files by name in descending order to mimic 'reverse=True'
     request_files.sort(reverse=True)
             req_content = json.load(f)
             if req_content["status"] == "FINISHED" and req_content["precision"] == precision.split(".")[-1]:
                 request_file = str(request_file)
     # Return empty string if no file found that matches criteria
     return request_file
     """From the path of the results folder root, extract all needed info for results"""
     with open(dynamic_path) as f:
         dynamic_data = json.load(f)
     results_path = Path(results_path)
+    model_files = list(results_path.rglob("results_*.json"))
     model_files.sort(key=lambda file: parse_datetime(file.stem.removeprefix("results_")))
     eval_results = {}
             continue
     return results