Bram Vanroy committed on
Commit 0658988
1 Parent(s): 351f9fe

remove mixtral - was not tested in 8-bit

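For context on the commit message: the remaining leaderboard entries were evaluated with 8-bit weight loading, while the Mixtral-8x7B run was not (its removed evals/models.json entry, shown further down, has "quantization": null). Below is a minimal, hypothetical consistency check over evals/models.json, assuming it is a flat mapping from short model names to metadata as the diff context suggests; it is an illustration of the rationale, not a script that ships with this repository.

# Hypothetical check (not part of this repo): list models.json entries that were
# not benchmarked with 8-bit quantization. The removed "mixtral-8x7b-v0.1" entry
# ("quantization": null) is exactly the kind of entry this would flag.
import json
from pathlib import Path

models = json.loads(Path("evals/models.json").read_text(encoding="utf-8"))
not_8bit = [name for name, info in models.items() if info.get("quantization") != "8-bit"]
print("entries not benchmarked in 8-bit:", not_8bit or "none")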
app.py CHANGED
@@ -28,6 +28,8 @@ MODEL_TYPE_EMOJIS = {
     "RL-tuned": "🟦",
 }
 
+NOT_GIVEN_SYMBOL = "❔"
+
 
 @dataclass
 class Result:
@@ -44,12 +46,14 @@ class Result:
     num_parameters_kmb: str = field(init=False)
 
     def __post_init__(self):
-        if self.model_type not in ["pretrained", "fine-tuned", "instruction-tuned", "RL-tuned"]:
-            raise ValueError(
-                f"Model type {self.model_type} must be one of 'pretrained', 'fine-tuned', 'instruction-tuned', 'RL-tuned'"
-            )
-        if self.dutch_coverage not in ["none", "pretrained", "fine-tuned"]:
-            raise ValueError(f"Dutch coverage {self.dutch_coverage} must be one of 'none', 'pretrained', 'fine-tuned'")
+        if self.model_type not in ["pretrained", "fine-tuned", "instruction-tuned", "RL-tuned", "not-given"]:
+            raise ValueError(
+                f"Model type {self.model_type} must be one of 'pretrained', 'fine-tuned', 'instruction-tuned', 'RL-tuned', 'not-given'"
+            )
+        if self.dutch_coverage not in ["none", "pretrained", "fine-tuned", "not-given"]:
+            raise ValueError(
+                f"Dutch coverage {self.dutch_coverage} must be one of 'none', 'pretrained', 'fine-tuned', 'not-given'"
+            )
 
         field_names = {f.name for f in fields(self)}
         for task_name in TASK_METRICS:
@@ -128,8 +132,10 @@ class ResultSet:
                 f" dotted;'>{result.short_name}</a>"
             )
             if attr == "short_name"
-            else MODEL_TYPE_EMOJIS[result.model_type]
+            else MODEL_TYPE_EMOJIS.get(result.model_type, NOT_GIVEN_SYMBOL)
             if attr == "model_type"
+            else (result.dutch_coverage if result.dutch_coverage != "not-given" else NOT_GIVEN_SYMBOL)
+            if attr == "dutch_coverage"
             else getattr(result, attr)
             for attr, col_name in self.column_names.items()
         }
@@ -203,8 +209,16 @@ def collect_results() -> ResultSet:
 
         if "results" not in data:
             continue
+
         task_results = data["results"]
         short_name = pfin.stem.split("_", 2)[2].lower()
+
+        if short_name not in model_info:
+            raise KeyError(
+                f"Model {short_name} not found in overview file {pf_overview.name}. This means that a results JSON"
+                f" file exists that has not yet been processed. First run the `generate_overview_json.py` script."
+            )
+
         if short_name not in model_results:
             model_results[short_name] = {
                 "short_name": short_name,
@@ -228,7 +242,10 @@ with gr.Blocks() as demo:
     gr.HTML(TITLE)
    gr.Markdown(INTRO_TEXT)
 
-    gr.Markdown(f"## Leaderboard\nOnly representative for the Dutch version (`*_nl`) of the benchmarks!")
+    gr.Markdown(
+        f"## Leaderboard\nOnly representative for the Dutch version (`*_nl`) of the benchmarks!"
+        " All models have been benchmarked in 8-bit."
+    )
 
     results = collect_results()
 
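The "not-given" fallback behaviour added above can be exercised in isolation. In the sketch below, NOT_GIVEN_SYMBOL and the dict .get() / conditional-expression pattern are taken from the diff, while the Entry dataclass and render_cells helper are simplified stand-ins for the app's Result and ResultSet classes, not the actual implementation.

# Minimal sketch of the "not-given" display fallback introduced in this commit.
from dataclasses import dataclass

MODEL_TYPE_EMOJIS = {"RL-tuned": "🟦"}  # trimmed; the real dict maps all model types
NOT_GIVEN_SYMBOL = "❔"


@dataclass
class Entry:  # simplified stand-in for Result
    model_type: str
    dutch_coverage: str


def render_cells(entry: Entry) -> dict:
    """Map metadata to display values, falling back to the ❔ symbol when unknown."""
    return {
        # .get() avoids a KeyError for model types without an emoji, e.g. "not-given"
        "model_type": MODEL_TYPE_EMOJIS.get(entry.model_type, NOT_GIVEN_SYMBOL),
        "dutch_coverage": entry.dutch_coverage
        if entry.dutch_coverage != "not-given"
        else NOT_GIVEN_SYMBOL,
    }


print(render_cells(Entry("not-given", "not-given")))  # {'model_type': '❔', 'dutch_coverage': '❔'}
print(render_cells(Entry("RL-tuned", "pretrained")))  # {'model_type': '🟦', 'dutch_coverage': 'pretrained'}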
evals/hellaswag/hellaswag_nl_Mistral-7B-v0.1.json DELETED
@@ -1,23 +0,0 @@
-{
-    "results": {
-        "hellaswag_nl": {
-            "acc": 0.44079870480302213,
-            "acc_stderr": 0.005158280633507224,
-            "acc_norm": 0.5840259039395574,
-            "acc_norm_stderr": 0.005120942804814836
-        }
-    },
-    "versions": {
-        "hellaswag_nl": 1
-    },
-    "config": {
-        "model": "hf-auto",
-        "model_args": "pretrained=mistralai/Mistral-7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=bfloat16,load_in_8bit=True",
-        "batch_size": "auto",
-        "device": "cuda",
-        "no_cache": false,
-        "limit": null,
-        "bootstrap_iters": 100000,
-        "description_dict": {}
-    }
-}
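Result files like the one deleted above follow the naming convention <task>_<lang>_<ModelName>.json and are picked up by collect_results() in app.py. The standalone reader below mirrors the stem-splitting and the "results" check visible in the diff; the directory path and the metric that is printed are only illustrative.

# Standalone sketch of reading hellaswag_nl result files; mirrors the
# pfin.stem.split("_", 2)[2].lower() convention used in app.py.
import json
from pathlib import Path

for pfin in sorted(Path("evals/hellaswag").glob("*.json")):
    data = json.loads(pfin.read_text(encoding="utf-8"))
    if "results" not in data:  # skip files without scores, as collect_results() does
        continue
    # "hellaswag_nl_Mistral-7B-v0.1" -> "mistral-7b-v0.1"
    short_name = pfin.stem.split("_", 2)[2].lower()
    scores = data["results"]["hellaswag_nl"]
    print(f"{short_name}: acc_norm={scores['acc_norm']:.4f}")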
evals/hellaswag/hellaswag_nl_Mixtral-8x7B-v0.1.json DELETED
@@ -1,23 +0,0 @@
-{
-    "results": {
-        "hellaswag_nl": {
-            "acc": 0.5143011332973556,
-            "acc_stderr": 0.0051926973681393875,
-            "acc_norm": 0.67835941716136,
-            "acc_norm_stderr": 0.004853064643337017
-        }
-    },
-    "versions": {
-        "hellaswag_nl": 1
-    },
-    "config": {
-        "model": "hf-auto",
-        "model_args": "pretrained=mistralai/Mixtral-8x7B-v0.1,use_accelerate=True,device_map_option=auto,dtype=auto",
-        "batch_size": 1,
-        "device": "cuda",
-        "no_cache": false,
-        "limit": null,
-        "bootstrap_iters": 100000,
-        "description_dict": {}
-    }
-}
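The two deleted files above show the discrepancy behind this commit: the Mistral-7B-v0.1 run has load_in_8bit=True in config.model_args, while the Mixtral-8x7B-v0.1 run was executed with dtype=auto and no 8-bit loading. A hypothetical audit script (not part of the repository) could detect such runs from the result files alone:

# Hypothetical audit (not in this repo): flag result files whose run configuration
# does not include 8-bit loading, based on the "model_args" string in "config".
import json
from pathlib import Path

def ran_in_8bit(result_file: Path) -> bool:
    config = json.loads(result_file.read_text(encoding="utf-8")).get("config", {})
    return "load_in_8bit=True" in config.get("model_args", "")

for pfin in sorted(Path("evals").rglob("*_nl_*.json")):
    if not ran_in_8bit(pfin):
        print(f"not benchmarked in 8-bit: {pfin}")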
evals/models.json CHANGED
@@ -87,14 +87,6 @@
         "num_parameters": 7241732096,
         "quantization": "8-bit"
     },
-    "mixtral-8x7b-v0.1": {
-        "compute_dtype": "auto",
-        "dutch_coverage": "not-given",
-        "model_name": "mistralai/Mixtral-8x7B-v0.1",
-        "model_type": "not-given",
-        "num_parameters": 46702792704,
-        "quantization": null
-    },
     "neural-chat-7b-v3-1": {
         "compute_dtype": "bfloat16",
         "dutch_coverage": "none",
generate_overview_json.py CHANGED
@@ -40,7 +40,8 @@ def main():
             "model_type": results[short_name]["model_type"]
             if short_name in results and "model_type" in results[short_name]
             else "not-given",
-            "dutch_coverage": results[short_name]["dutch_coverage"] if short_name in results and "dutch_coverage" in results[short_name]
+            "dutch_coverage": results[short_name]["dutch_coverage"]
+            if short_name in results and "dutch_coverage" in results[short_name]
             else "not-given",
         }
 
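The reflowed conditional above keeps the existing default-to-"not-given" pattern. For reference, the same fallback can be expressed more compactly with dict .get() defaults; the helper below illustrates the pattern and is not the code used in generate_overview_json.py.

# Illustrative equivalent of the "not-given" fallback, using dict.get() defaults.
def merge_metadata(short_name: str, results: dict) -> dict:
    known = results.get(short_name, {})
    return {
        "model_type": known.get("model_type", "not-given"),
        "dutch_coverage": known.get("dutch_coverage", "not-given"),
    }

print(merge_metadata("mixtral-8x7b-v0.1", {}))
# {'model_type': 'not-given', 'dutch_coverage': 'not-given'}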