Spaces:
Runtime error
Runtime error
Yeoni Rhee
committed on
Commit
·
e5ad674
1
Parent(s):
a987038
Final Sotopia Task Submission
Browse files- sotopia_space/benchmark.py +6 -4
- sotopia_space/constants.py +25 -0
- sotopia_space/utils.py +38 -2
sotopia_space/benchmark.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
import gradio as gr # type: ignore
|
2 |
import pandas as pd
|
3 |
from sotopia_space.constants import MODEL_OPTIONS
|
4 |
-
from sotopia_space.utils import
|
5 |
|
6 |
LP_MODE = "v2"
|
7 |
original_df, ablation_df = None, None
|
@@ -16,13 +16,15 @@ def benchmark_table():
|
|
16 |
|
17 |
with gr.TabItem("Vs GPT-3.5", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
|
18 |
original_df = pd.read_json('data_dir/models_vs_gpt35.jsonl', lines=True)
|
19 |
-
default_main_df =
|
20 |
default_main_df = default_main_df.sort_values(by="GOAL [0, 10]", ascending=False)
|
|
|
21 |
# add a Rank column to the first columnn (starting from 1)
|
22 |
default_main_df.insert(0, "Rank", range(1, 1 + len(default_main_df)))
|
|
|
23 |
with gr.Row():
|
24 |
with gr.Column(scale=4):
|
25 |
-
gr.Markdown("
|
26 |
TYPES = ["number", "markdown", "number"]
|
27 |
leaderboard_table = gr.components.Dataframe(
|
28 |
value=default_main_df,
|
@@ -33,4 +35,4 @@ def benchmark_table():
|
|
33 |
interactive=False,
|
34 |
visible=True,
|
35 |
min_width=60,
|
36 |
-
)
|
|
|
1 |
import gradio as gr # type: ignore
|
2 |
import pandas as pd
|
3 |
from sotopia_space.constants import MODEL_OPTIONS
|
4 |
+
from sotopia_space.utils import post_processing
|
5 |
|
6 |
LP_MODE = "v2"
|
7 |
original_df, ablation_df = None, None
|
|
|
16 |
|
17 |
with gr.TabItem("Vs GPT-3.5", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
|
18 |
original_df = pd.read_json('data_dir/models_vs_gpt35.jsonl', lines=True)
|
19 |
+
default_main_df = original_df
|
20 |
default_main_df = default_main_df.sort_values(by="GOAL [0, 10]", ascending=False)
|
21 |
+
default_main_df = post_processing(default_main_df, None)
|
22 |
# add a Rank column to the first columnn (starting from 1)
|
23 |
default_main_df.insert(0, "Rank", range(1, 1 + len(default_main_df)))
|
24 |
+
|
25 |
with gr.Row():
|
26 |
with gr.Column(scale=4):
|
27 |
+
gr.Markdown("<h3>**Vs GPT3.5**: The interlocutors are compared against GPT-3.5, the baseline model.")
|
28 |
TYPES = ["number", "markdown", "number"]
|
29 |
leaderboard_table = gr.components.Dataframe(
|
30 |
value=default_main_df,
|
|
|
35 |
interactive=False,
|
36 |
visible=True,
|
37 |
min_width=60,
|
38 |
+
)
|
sotopia_space/constants.py
CHANGED
@@ -12,3 +12,28 @@ MODEL_OPTIONS = [
|
|
12 |
# "together_ai/togethercomputer/llama-2-7b-chat",
|
13 |
# "together_ai/togethercomputer/falcon-7b-instruct",
|
14 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
# "together_ai/togethercomputer/llama-2-7b-chat",
|
13 |
# "together_ai/togethercomputer/falcon-7b-instruct",
|
14 |
]
|
15 |
+
|
16 |
+
# Display metadata for every benchmarked model: maps the raw model key used in
# the results files to a human-readable name plus a Hugging Face repo id — or a
# plain URL for closed models that have no HF repository.
MODEL_INFO = {
    "Llama-2-13b-chat-hf.nosp": {"pretty_name": "Llama-2-13B-chat", "hf_model_id": "meta-llama/Llama-2-13b-chat-hf"},
    "Llama-2-70b-chat-hf.nosp": {"pretty_name": "Llama-2-70B-chat", "hf_model_id": "meta-llama/Llama-2-70b-chat-hf"},
    "Llama-2-7b-chat-hf.nosp": {"pretty_name": "Llama-2-7B-chat", "hf_model_id": "meta-llama/Llama-2-7b-chat-hf"},
    "Llama-2-7b-chat-hf": {"pretty_name": "Llama-2-7B-chat (+sys prmpt)", "hf_model_id": "meta-llama/Llama-2-7b-chat-hf"},
    "Mistral-7B-Instruct-v0.1": {"pretty_name": "Mistral-7B-Instruct", "hf_model_id": "mistralai/Mistral-7B-Instruct-v0.1"},
    "Mistral-7B-Instruct-v0.2": {"pretty_name": "Mistral-7B-Instruct (v0.2)", "hf_model_id": "mistralai/Mistral-7B-Instruct-v0.2"},
    "Mixtral-8x7B-Instruct-v0.1": {"pretty_name": "Mixtral-8x7B-Instruct", "hf_model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1"},
    "Nous-Hermes-2-Mixtral-8x7B-DPO": {"pretty_name": "Nous-Hermes-2-Mixtral-8x7B-DPO", "hf_model_id": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"},
    "Yi-34B-Chat": {"pretty_name": "Yi-34B-Chat", "hf_model_id": "01-ai/Yi-34B"},
    "gemini-1.0-pro": {"pretty_name": "gemini-1.0-pro", "hf_model_id": "https://blog.google/technology/ai/google-gemini-ai/"},
    "gemma-7b-it": {"pretty_name": "Gemma-7B-it", "hf_model_id": "google/gemma-7b"},
    "gpt-3.5-turbo-0125": {"pretty_name": "gpt-3.5-turbo-0125", "hf_model_id": "https://platform.openai.com/"},
    "gpt-4-0125-preview": {"pretty_name": "gpt-4-0125-preview", "hf_model_id": "https://platform.openai.com/"},
    "tulu-2-dpo-70b": {"pretty_name": "Tulu-2-dpo-70b", "hf_model_id": "cmu-lti/tulu-2-dpo-70b"},
    "vicuna-13b-v1.5": {"pretty_name": "Vicuna-13b-v1.5", "hf_model_id": "lmsys/vicuna-13b-v1.5"},
    "zephyr-7b-beta": {"pretty_name": "Zephyr-7b-beta", "hf_model_id": "HuggingFaceH4/zephyr-7b-beta"},
    "mistral-large-2402": {"pretty_name": "Mistral-Large", "hf_model_id": "https://mistral.ai/news/mistral-large/"},
    "claude-3-opus-20240229": {"pretty_name": "Claude 3 Opus", "hf_model_id": "https://www.anthropic.com/claude"},
    "claude-3-sonnet-20240229": {"pretty_name": "Claude 3 Sonnet", "hf_model_id": "https://www.anthropic.com/claude"},
    "zephyr-7b-gemma-v0.1": {"pretty_name": "Zephyr-7b-Gemma", "hf_model_id": "HuggingFaceH4/zephyr-7b-gemma-v0.1"},
    "Starling-LM-7B-beta": {"pretty_name": "StarlingLM-7B-beta", "hf_model_id": "Nexusflow/Starling-LM-7B-beta"},
    "dbrx-instruct": {"pretty_name": "DBRX Instruct", "hf_model_id": "databricks/dbrx-instruct"}
}
|
sotopia_space/utils.py
CHANGED
@@ -1,3 +1,39 @@
|
|
|
|
|
|
1 |
|
2 |
-
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from ui_constants import column_names
|
2 |
+
from sotopia_space.constants import MODEL_INFO
|
3 |
|
4 |
+
|
5 |
+
def make_clickable_model(model_name):
    """Return an HTML link for a known model, or the raw name if unknown.

    Keys of MODEL_INFO are rendered as a styled ``<a>`` tag labelled with the
    model's pretty name.  Entries whose ``hf_model_id`` starts with "http" are
    closed/proprietary models and are linked to that URL directly (🔒 icon);
    everything else is treated as a Hugging Face repo id and linked to
    huggingface.co (🔥 icon).

    Fixes over the previous version: dropped the needless ``global MODEL_INFO``
    (the dict is only read, never rebound) and collapsed the repeated
    ``MODEL_INFO[model_name]`` lookups into a single ``.get()``.
    """
    info = MODEL_INFO.get(model_name)
    if info is None:
        # Unknown model: show the raw name unmodified.
        return model_name
    model_id = info["hf_model_id"]
    if model_id.startswith("http"):
        icon, link = "🔒", model_id
    else:
        icon, link = "🔥", f"https://huggingface.co/{model_id}"
    return (
        f'{icon} <a target="_blank" href="{link}" '
        f'style="color: var(--link-text-color); text-decoration: underline;'
        f'text-decoration-style: dotted;">{info["pretty_name"]}</a>'
    )
|
16 |
+
|
17 |
+
# Formats the columns
def formatter(x):
    """Format one leaderboard cell for display.

    Strings pass through unchanged; any numeric value is rounded to two
    decimal places.  Replaces the old ``type(x) is str`` check (which would
    reject str subclasses) with ``isinstance`` and removes the dead ``x = x``
    no-op.
    """
    if isinstance(x, str):
        return x
    return round(x, 2)
|
24 |
+
|
25 |
+
def post_processing(df, model_len_info):
    """Prepare a leaderboard dataframe for display.

    Mutates *df* in place — model names become clickable HTML links, numeric
    cells are rounded via ``formatter``, columns are renamed through
    ``column_names`` and rows sorted by "GOAL [0, 10]" descending — then
    returns the frame with "model_name" and "GOAL [0, 10]" moved to the front.

    When *model_len_info* is provided it must map each raw model name to a
    dict with an "avg_len" entry, which is added as a "Length" column.
    """
    if model_len_info:
        df["Length"] = df["model_name"].apply(lambda m: model_len_info[m]["avg_len"])

    for column in df.columns:
        if column == "model_name":
            # str.replace(x, y) on the whole string is just y, so this is the
            # original `x.replace(x, make_clickable_model(x))` simplified.
            df[column] = df[column].apply(make_clickable_model)
        else:
            # Numerical values: round for display.
            df[column] = df[column].apply(formatter)

    df.rename(columns=column_names, inplace=True)
    df.sort_values(by="GOAL [0, 10]", inplace=True, ascending=False)
    # NOTE(review): the selection below assumes column_names leaves both
    # "model_name" and "GOAL [0, 10]" untouched — confirm against ui_constants.
    front = ["model_name", "GOAL [0, 10]"]
    remaining = [c for c in df.columns if c not in front]
    df = df[front + remaining]
    return df
|