Commit bba982c: add private repo
Sean Cho committed
1 parent: fced05c
Files changed:
- .gitignore +2 -0
- app.py +4 -3
- src/assets/hardcoded_evals.py +0 -26
- src/display_models/read_results.py +3 -2
- src/load_from_hub.py +1 -7
.gitignore CHANGED
@@ -10,6 +10,8 @@ gpt_4_evals/
 human_evals/
 eval-queue/
 eval-results/
+eval-queue-private/
+eval-results-private/
 auto_evals/
 
 src/assets/model_counts.html
app.py CHANGED
@@ -2,6 +2,7 @@ import json
 import os
 from datetime import datetime, timezone
 import re
+from distutils.util import strtobool
 
 import gradio as gr
 import pandas as pd
@@ -38,10 +39,10 @@ H4_TOKEN = os.environ.get("H4_TOKEN", None)
 QUEUE_REPO = "open-ko-llm-leaderboard/requests"
 RESULTS_REPO = "open-ko-llm-leaderboard/results"
 
-PRIVATE_QUEUE_REPO = "open-ko-llm-leaderboard/requests"
-PRIVATE_RESULTS_REPO = "open-ko-llm-leaderboard/results"
+PRIVATE_QUEUE_REPO = "open-ko-llm-leaderboard/private-requests"
+PRIVATE_RESULTS_REPO = "open-ko-llm-leaderboard/private-results"
 
-IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
+IS_PUBLIC = bool(strtobool(os.environ.get("IS_PUBLIC", "True")))
 
 EVAL_REQUESTS_PATH = "eval-queue"
 EVAL_RESULTS_PATH = "eval-results"
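For context on the IS_PUBLIC change above: bool() on an environment variable is True for any non-empty string, so the old expression could never be switched off from the Space settings; strtobool parses the string's meaning instead. A minimal standalone sketch of the difference (illustrative only, not part of this commit):

    import os
    from distutils.util import strtobool  # note: removed from the stdlib in Python 3.12

    os.environ["IS_PUBLIC"] = "False"

    # Old pattern: any non-empty string is truthy, so "False" still yields True.
    print(bool(os.environ.get("IS_PUBLIC", True)))               # True
    # New pattern: strtobool maps "false"/"no"/"0" to 0 and "true"/"yes"/"1" to 1.
    print(bool(strtobool(os.environ.get("IS_PUBLIC", "True"))))  # False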
src/assets/hardcoded_evals.py CHANGED
@@ -1,31 +1,5 @@
 from src.display_models.utils import AutoEvalColumn, model_hyperlink
 
-gpt4_values = {
-    AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt4"),
-    AutoEvalColumn.revision.name: "tech report",
-    AutoEvalColumn.precision.name: None,
-    AutoEvalColumn.average.name: 84.3,
-    AutoEvalColumn.arc.name: 96.3,
-    AutoEvalColumn.hellaswag.name: 95.3,
-    AutoEvalColumn.mmlu.name: 86.4,
-    AutoEvalColumn.truthfulqa.name: 59.0,
-    AutoEvalColumn.dummy.name: "GPT-4",
-    AutoEvalColumn.model_type.name: "",
-}
-
-gpt35_values = {
-    AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt3.5"),
-    AutoEvalColumn.revision.name: "tech report",
-    AutoEvalColumn.precision.name: None,
-    AutoEvalColumn.average.name: 71.9,
-    AutoEvalColumn.arc.name: 85.2,
-    AutoEvalColumn.hellaswag.name: 85.5,
-    AutoEvalColumn.mmlu.name: 70.0,
-    AutoEvalColumn.truthfulqa.name: 47.0,
-    AutoEvalColumn.dummy.name: "GPT-3.5",
-    AutoEvalColumn.model_type.name: "",
-}
-
 baseline = {
     AutoEvalColumn.model.name: "<p>Baseline</p>",
     AutoEvalColumn.revision.name: "N/A",
src/display_models/read_results.py CHANGED
@@ -2,6 +2,7 @@ import json
 import os
 from dataclasses import dataclass
 from typing import Dict, List, Tuple
+from distutils.util import strtobool
 
 import dateutil
 import numpy as np
@@ -19,7 +20,7 @@ BENCH_TO_NAME = {
     # TODO: Uncomment when we have results for these
     # "ethicalverification": AutoEvalColumn.ethicalverification.name,
 }
-
+IS_PUBLIC = bool(strtobool(os.environ.get("IS_PUBLIC", "True")))
 
 @dataclass
 class EvalResult:
@@ -114,7 +115,7 @@ def parse_eval_result(json_filepath: str) -> Tuple[str, list[dict]]:
 def get_eval_results() -> List[EvalResult]:
     json_filepaths = []
 
-    for root, dir, files in os.walk("eval-results"):
+    for root, dir, files in os.walk("eval-results" + ("-private" if not IS_PUBLIC else "")):
         # We should only have json files in model results
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
             continue
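The new directory switch in get_eval_results simply appends a "-private" suffix when the Space is not public, matching the eval-results-private/ entry added to .gitignore. A small sketch of the resulting walk, assuming the directories exist locally (names as in the diff, otherwise illustrative):

    import os
    from distutils.util import strtobool

    IS_PUBLIC = bool(strtobool(os.environ.get("IS_PUBLIC", "True")))

    # Public deployments read "eval-results"; the private one reads "eval-results-private".
    results_dir = "eval-results" + ("-private" if not IS_PUBLIC else "")

    for root, _dirs, files in os.walk(results_dir):
        # Skip directories that are empty or contain non-JSON files, as in the source.
        if len(files) == 0 or any(not f.endswith(".json") for f in files):
            continue
        print(root, files)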
src/load_from_hub.py CHANGED
@@ -6,13 +6,11 @@ from huggingface_hub import Repository
 from transformers import AutoConfig
 from collections import defaultdict
 
-from src.assets.hardcoded_evals import baseline, gpt4_values, gpt35_values
+from src.assets.hardcoded_evals import baseline
 from src.display_models.get_model_metadata import apply_metadata
 from src.display_models.read_results import get_eval_results_dicts, make_clickable_model
 from src.display_models.utils import AutoEvalColumn, EvalQueueColumn, has_no_nan_values
 
-IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
-
 
 def get_all_requested_models(requested_models_dir: str) -> set[str]:
     depth = 1
@@ -75,10 +73,6 @@ def get_leaderboard_df(
 
     all_data = get_eval_results_dicts()
 
-    if not IS_PUBLIC:
-        all_data.append(gpt4_values)
-        all_data.append(gpt35_values)
-
     # all_data.append(baseline)
     apply_metadata(all_data)  # Populate model type based on known hardcoded values in `metadata.py`
 