Yeoni Rhee committed on
Commit
e5ad674
·
1 Parent(s): a987038

Final Sotopia Task Submission

Browse files
sotopia_space/benchmark.py CHANGED
@@ -1,7 +1,7 @@
1
  import gradio as gr # type: ignore
2
  import pandas as pd
3
  from sotopia_space.constants import MODEL_OPTIONS
4
- from sotopia_space.utils import apply_length_penalty
5
 
6
  LP_MODE = "v2"
7
  original_df, ablation_df = None, None
@@ -16,13 +16,15 @@ def benchmark_table():
16
 
17
  with gr.TabItem("Vs GPT-3.5", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
18
  original_df = pd.read_json('data_dir/models_vs_gpt35.jsonl', lines=True)
19
- default_main_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
20
  default_main_df = default_main_df.sort_values(by="GOAL [0, 10]", ascending=False)
 
21
  # add a Rank column to the first column (starting from 1)
22
  default_main_df.insert(0, "Rank", range(1, 1 + len(default_main_df)))
 
23
  with gr.Row():
24
  with gr.Column(scale=4):
25
- gr.Markdown("**Vs GPT3.5**: The interlocutors are compared against GPT-3.5, the baseline model.")
26
  TYPES = ["number", "markdown", "number"]
27
  leaderboard_table = gr.components.Dataframe(
28
  value=default_main_df,
@@ -33,4 +35,4 @@ def benchmark_table():
33
  interactive=False,
34
  visible=True,
35
  min_width=60,
36
- )
 
1
  import gradio as gr # type: ignore
2
  import pandas as pd
3
  from sotopia_space.constants import MODEL_OPTIONS
4
+ from sotopia_space.utils import post_processing
5
 
6
  LP_MODE = "v2"
7
  original_df, ablation_df = None, None
 
16
 
17
  with gr.TabItem("Vs GPT-3.5", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
18
  original_df = pd.read_json('data_dir/models_vs_gpt35.jsonl', lines=True)
19
+ default_main_df = original_df
20
  default_main_df = default_main_df.sort_values(by="GOAL [0, 10]", ascending=False)
21
+ default_main_df = post_processing(default_main_df, None)
22
  # add a Rank column to the first column (starting from 1)
23
  default_main_df.insert(0, "Rank", range(1, 1 + len(default_main_df)))
24
+
25
  with gr.Row():
26
  with gr.Column(scale=4):
27
+ gr.Markdown("<h3>**Vs GPT3.5**: The interlocutors are compared against GPT-3.5, the baseline model.")
28
  TYPES = ["number", "markdown", "number"]
29
  leaderboard_table = gr.components.Dataframe(
30
  value=default_main_df,
 
35
  interactive=False,
36
  visible=True,
37
  min_width=60,
38
+ )
sotopia_space/constants.py CHANGED
@@ -12,3 +12,28 @@ MODEL_OPTIONS = [
12
  # "together_ai/togethercomputer/llama-2-7b-chat",
13
  # "together_ai/togethercomputer/falcon-7b-instruct",
14
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  # "together_ai/togethercomputer/llama-2-7b-chat",
13
  # "together_ai/togethercomputer/falcon-7b-instruct",
14
  ]
15
+
16
# Display metadata for every benchmarked model, keyed by the model name as it
# appears in the results files (e.g. data_dir/models_vs_gpt35.jsonl).
#   pretty_name: human-readable label rendered in the leaderboard table.
#   hf_model_id: a Hugging Face repo id for open models, or a full http(s)
#                URL for closed/proprietary models (make_clickable_model in
#                sotopia_space/utils.py branches on the "http" prefix).
MODEL_INFO = {
    "Llama-2-13b-chat-hf.nosp": {"pretty_name": "Llama-2-13B-chat", "hf_model_id": "meta-llama/Llama-2-13b-chat-hf"},
    "Llama-2-70b-chat-hf.nosp": {"pretty_name": "Llama-2-70B-chat", "hf_model_id": "meta-llama/Llama-2-70b-chat-hf"},
    "Llama-2-7b-chat-hf.nosp": {"pretty_name": "Llama-2-7B-chat", "hf_model_id": "meta-llama/Llama-2-7b-chat-hf"},
    "Llama-2-7b-chat-hf": {"pretty_name": "Llama-2-7B-chat (+sys prmpt)", "hf_model_id": "meta-llama/Llama-2-7b-chat-hf"},
    "Mistral-7B-Instruct-v0.1": {"pretty_name": "Mistral-7B-Instruct", "hf_model_id": "mistralai/Mistral-7B-Instruct-v0.1"},
    "Mistral-7B-Instruct-v0.2": {"pretty_name": "Mistral-7B-Instruct (v0.2)", "hf_model_id": "mistralai/Mistral-7B-Instruct-v0.2"},
    "Mixtral-8x7B-Instruct-v0.1": {"pretty_name": "Mixtral-8x7B-Instruct", "hf_model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1"},
    "Nous-Hermes-2-Mixtral-8x7B-DPO": {"pretty_name": "Nous-Hermes-2-Mixtral-8x7B-DPO", "hf_model_id": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"},
    "Yi-34B-Chat": {"pretty_name": "Yi-34B-Chat", "hf_model_id": "01-ai/Yi-34B"},
    "gemini-1.0-pro": {"pretty_name": "gemini-1.0-pro", "hf_model_id": "https://blog.google/technology/ai/google-gemini-ai/"},
    "gemma-7b-it": {"pretty_name": "Gemma-7B-it", "hf_model_id": "google/gemma-7b"},
    "gpt-3.5-turbo-0125": {"pretty_name": "gpt-3.5-turbo-0125", "hf_model_id": "https://platform.openai.com/"},
    "gpt-4-0125-preview": {"pretty_name": "gpt-4-0125-preview", "hf_model_id": "https://platform.openai.com/"},
    "tulu-2-dpo-70b": {"pretty_name": "Tulu-2-dpo-70b", "hf_model_id": "cmu-lti/tulu-2-dpo-70b"},
    "vicuna-13b-v1.5": {"pretty_name": "Vicuna-13b-v1.5", "hf_model_id": "lmsys/vicuna-13b-v1.5"},
    "zephyr-7b-beta": {"pretty_name": "Zephyr-7b-beta", "hf_model_id": "HuggingFaceH4/zephyr-7b-beta"},
    "mistral-large-2402": {"pretty_name": "Mistral-Large", "hf_model_id": "https://mistral.ai/news/mistral-large/"},
    "claude-3-opus-20240229": {"pretty_name": "Claude 3 Opus", "hf_model_id": "https://www.anthropic.com/claude"},
    "claude-3-sonnet-20240229": {"pretty_name": "Claude 3 Sonnet", "hf_model_id": "https://www.anthropic.com/claude"},
    "zephyr-7b-gemma-v0.1": {"pretty_name": "Zephyr-7b-Gemma", "hf_model_id": "HuggingFaceH4/zephyr-7b-gemma-v0.1"},
    "Starling-LM-7B-beta": {"pretty_name": "StarlingLM-7B-beta", "hf_model_id": "Nexusflow/Starling-LM-7B-beta"},
    "dbrx-instruct": {"pretty_name": "DBRX Instruct", "hf_model_id": "databricks/dbrx-instruct"}
}
sotopia_space/utils.py CHANGED
@@ -1,3 +1,39 @@
 
 
1
 
2
- def apply_length_penalty(original_df, ablation_df, length_penalty=0.2, mode='v1', LP_original_dfs=None):
3
- return original_df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ui_constants import column_names
2
+ from sotopia_space.constants import MODEL_INFO
3
 
4
+
5
def make_clickable_model(model_name):
    """Render *model_name* as an HTML anchor for the leaderboard table.

    Lookup happens in the module-level MODEL_INFO mapping
    (sotopia_space.constants):
      - if "hf_model_id" starts with "http", it is treated as a vendor page
        for a closed model and rendered with a lock emoji;
      - otherwise it is treated as a Hugging Face repo id and linked to
        huggingface.co with a fire emoji.
    Unknown models fall back to the plain name string.
    """
    # NOTE: the original declared `global MODEL_INFO`, which is unnecessary
    # for read-only access — module-level names resolve via normal lookup.
    if model_name not in MODEL_INFO:
        return model_name
    pretty_name = MODEL_INFO[model_name]["pretty_name"]
    hf_model_id = MODEL_INFO[model_name]["hf_model_id"]
    # Shared inline style for both link flavors (kept byte-identical to the
    # original output).
    style = "color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"
    if hf_model_id.startswith("http"):
        # Closed/proprietary model: hf_model_id is already a full URL.
        return f'🔒 <a target="_blank" href="{hf_model_id}" style="{style}">{pretty_name}</a>'
    # Open model hosted on the Hugging Face Hub.
    link = f"https://huggingface.co/{hf_model_id}"
    return f'🔥 <a target="_blank" href="{link}" style="{style}">{pretty_name}</a>'
16
+
17
# Formats the columns
def formatter(x):
    """Format one table cell: round numbers to 2 decimals, pass strings through.

    Applied element-wise to DataFrame columns in post_processing.

    Args:
        x: a cell value — a string (e.g. a model-name link) or a number.

    Returns:
        x unchanged if it is a string, otherwise round(x, 2).
    """
    # isinstance() is the idiomatic type check (the original used
    # `type(x) is str` plus a dead `x = x` assignment).
    if isinstance(x, str):
        return x
    # Assumes non-string cells are numeric; round() raises TypeError
    # otherwise, matching the original behavior.
    return round(x, 2)
24
+
25
def post_processing(df, model_len_info):
    """Prepare a raw benchmark DataFrame for display in the leaderboard.

    Args:
        df: benchmark results containing a "model_name" column and a
            "GOAL [0, 10]" score column (mutated in place as well).
        model_len_info: optional mapping model_name -> {"avg_len": ...};
            when falsy (benchmark.py passes None) no Length column is added.

    Returns:
        The formatted DataFrame: model names rendered as clickable HTML
        links, numeric cells rounded via formatter, columns renamed with
        ui_constants.column_names, rows sorted by goal score descending,
        and the model-name/goal columns moved to the front.
    """
    if model_len_info:
        df["Length"] = df["model_name"].apply(lambda x: model_len_info[x]["avg_len"])

    for col in df.columns:
        if col == "model_name":
            # The original used `x.replace(x, make_clickable_model(x))`,
            # which is equivalent to calling the link builder directly
            # (for empty strings both paths yield "").
            df[col] = df[col].apply(make_clickable_model)
        else:
            df[col] = df[col].apply(formatter)  # round numerical values
    # NOTE(review): the rename runs before the by-name operations below, so
    # this assumes column_names maps neither "model_name" nor "GOAL [0, 10]"
    # to new labels — confirm against ui_constants.
    df.rename(columns=column_names, inplace=True)
    df.sort_values(by="GOAL [0, 10]", inplace=True, ascending=False)
    # Move the model-name and goal-score columns to the front.
    front = ["model_name", "GOAL [0, 10]"]
    df = df[front + [col for col in df.columns if col not in front]]
    return df