Xuhui committed on
Commit
e2fad4e
·
unverified ·
2 Parent(s): e140a93 aa9be2a

Merge pull request #64 from yeonir/yeonirhee/sotopia-task

Browse files
sotopia_space/benchmark.py CHANGED
@@ -1,42 +1,13 @@
1
  import gradio as gr # type: ignore
2
  import pandas as pd
3
  from sotopia_space.constants import MODEL_OPTIONS
4
- from sotopia_space.utils import estimated_win_rate, make_clickable_model, styled_error, styled_warning, styled_message,apply_length_penalty
5
 
6
  LP_MODE = "v2"
7
  original_df, ablation_df = None, None
8
  LP_original_dfs = {}
9
  DEFAULT_LP = 0.5
10
 
11
- available_models = [] # to be filled in later
12
- original_df, ablation_df = None, None
13
-
14
- def slider_change_main(length_penalty):
15
- global original_df, ablation_df, LP_MODE
16
- adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
17
- adjusted_df = adjusted_df[["Model", "Overall Elo", "Task-Avg Elo", "# battles", "Length"]]
18
- adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False)
19
- # adjusted_df = add_winrates(adjusted_df, LP=length_penalty)
20
- # adjusted_df = adjusted_df.drop(columns=["Length"])
21
- adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
22
- return adjusted_df
23
-
24
- def slider_change_full(length_penalty, show_winrate):
25
- global original_df, ablation_df, LP_MODE
26
- adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
27
- # sort the model by the "Task-Avg Elo" column
28
- adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False)
29
- adjusted_df.drop(columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"], inplace=True)
30
- if show_winrate == "none":
31
- adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
32
- return adjusted_df
33
- elif show_winrate == "gpt-3.5":
34
- adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-3.5", LP=length_penalty)
35
- elif show_winrate == "gpt-4":
36
- adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-4", LP=length_penalty)
37
- adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
38
- return adjusted_df
39
-
40
  def benchmark_table():
41
  global original_df, ablation_df
42
  global LP_original_dfs, LP_MODE
@@ -44,18 +15,15 @@ def benchmark_table():
44
  gr.Markdown(f"**Version**: sotopia (v1.01; 2024.04.22) | **# Examples**: 7200 | **# Models**: {len(MODEL_OPTIONS)} | **# Comparisons**: x", elem_classes="markdown-text")
45
 
46
  with gr.TabItem("Vs GPT-3.5", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
47
- # original_df, ablation_df = skip_empty_original_df, skip_empty_ablation_df
48
- original_df = pd.read_json('data_dir/models_vs_gpt35.jsonl', lines=True)
49
- default_main_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
50
  default_main_df = default_main_df.sort_values(by="GOAL [0, 10]", ascending=False)
 
51
  # add a Rank column as the first column (starting from 1)
52
  default_main_df.insert(0, "Rank", range(1, 1 + len(default_main_df)))
 
53
  with gr.Row():
54
  with gr.Column(scale=4):
55
- gr.Markdown("**Vs GPT3.5**: The interlocutors are compared against GPT-3.5, the baseline model.")
56
- with gr.Column(scale=1):
57
- length_penlty_slider = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP, label="Length Penalty", elem_id="length-penalty-slider")
58
- # checkbox_skip_empty = gr.Checkbox(label="Skip empty results", value=False, elem_id="skip-empty-checkbox", scale=2)
59
  TYPES = ["number", "markdown", "number"]
60
  leaderboard_table = gr.components.Dataframe(
61
  value=default_main_df,
@@ -66,5 +34,4 @@ def benchmark_table():
66
  interactive=False,
67
  visible=True,
68
  min_width=60,
69
- )
70
- #length_penlty_slider.change(fn=slider_change_main, inputs=[length_penlty_slider], outputs=[leaderboard_table])
 
1
  import gradio as gr # type: ignore
2
  import pandas as pd
3
  from sotopia_space.constants import MODEL_OPTIONS
4
+ from sotopia_space.utils import post_processing
5
 
6
  LP_MODE = "v2"
7
  original_df, ablation_df = None, None
8
  LP_original_dfs = {}
9
  DEFAULT_LP = 0.5
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  def benchmark_table():
12
  global original_df, ablation_df
13
  global LP_original_dfs, LP_MODE
 
15
  gr.Markdown(f"**Version**: sotopia (v1.01; 2024.04.22) | **# Examples**: 7200 | **# Models**: {len(MODEL_OPTIONS)} | **# Comparisons**: x", elem_classes="markdown-text")
16
 
17
  with gr.TabItem("Vs GPT-3.5", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
18
+ default_main_df = pd.read_json('data_dir/models_vs_gpt35.jsonl', lines=True)
 
 
19
  default_main_df = default_main_df.sort_values(by="GOAL [0, 10]", ascending=False)
20
+ default_main_df = post_processing(default_main_df, None)
21
  # add a Rank column as the first column (starting from 1)
22
  default_main_df.insert(0, "Rank", range(1, 1 + len(default_main_df)))
23
+
24
  with gr.Row():
25
  with gr.Column(scale=4):
26
+ gr.Markdown("<h3>**Vs GPT3.5**: The interlocutors are compared against GPT-3.5, the baseline model.")
 
 
 
27
  TYPES = ["number", "markdown", "number"]
28
  leaderboard_table = gr.components.Dataframe(
29
  value=default_main_df,
 
34
  interactive=False,
35
  visible=True,
36
  min_width=60,
37
+ )
 
sotopia_space/chat.py CHANGED
@@ -91,8 +91,6 @@ def chat_introduction():
91
  🗄️ **Disclaimer**: User prompts and generated replies from the model may be collected solely for the purpose of pure academic research. By using this demo, users implicitly agree to these terms.
92
  """
93
  )
94
- # with gr.Column(scale=1):
95
- # toggle_dark = gr.Button(value="Toggle Dark")
96
 
97
  def create_user_agent_dropdown(environment_id):
98
  _, environment_dict, agent_dict, relationship_dict = get_sotopia_profiles()
 
91
  🗄️ **Disclaimer**: User prompts and generated replies from the model may be collected solely for the purpose of pure academic research. By using this demo, users implicitly agree to these terms.
92
  """
93
  )
 
 
94
 
95
  def create_user_agent_dropdown(environment_id):
96
  _, environment_dict, agent_dict, relationship_dict = get_sotopia_profiles()
sotopia_space/constants.py CHANGED
@@ -14,26 +14,8 @@ MODEL_OPTIONS = [
14
  ]
15
 
16
  MODEL_INFO = {
17
- "Llama-2-13b-chat-hf.nosp": {"pretty_name": "Llama-2-13B-chat", "hf_model_id": "meta-llama/Llama-2-13b-chat-hf"},
18
- "Llama-2-70b-chat-hf.nosp": {"pretty_name": "Llama-2-70B-chat", "hf_model_id": "meta-llama/Llama-2-70b-chat-hf"},
19
- "Llama-2-7b-chat-hf.nosp": {"pretty_name": "Llama-2-7B-chat", "hf_model_id": "meta-llama/Llama-2-7b-chat-hf"},
20
- "Llama-2-7b-chat-hf": {"pretty_name": "Llama-2-7B-chat (+sys prmpt)", "hf_model_id": "meta-llama/Llama-2-7b-chat-hf"},
21
- "Mistral-7B-Instruct-v0.1": {"pretty_name": "Mistral-7B-Instruct", "hf_model_id": "mistralai/Mistral-7B-Instruct-v0.1"},
22
- "Mistral-7B-Instruct-v0.2": {"pretty_name": "Mistral-7B-Instruct (v0.2)", "hf_model_id": "mistralai/Mistral-7B-Instruct-v0.2"},
23
- "Mixtral-8x7B-Instruct-v0.1": {"pretty_name": "Mixtral-8x7B-Instruct", "hf_model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1"},
24
- "Nous-Hermes-2-Mixtral-8x7B-DPO": {"pretty_name": "Nous-Hermes-2-Mixtral-8x7B-DPO", "hf_model_id": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"},
25
- "Yi-34B-Chat": {"pretty_name": "Yi-34B-Chat", "hf_model_id": "01-ai/Yi-34B"},
26
- "gemini-1.0-pro": {"pretty_name": "gemini-1.0-pro", "hf_model_id": "https://blog.google/technology/ai/google-gemini-ai/"},
27
- "gemma-7b-it": {"pretty_name": "Gemma-7B-it", "hf_model_id": "google/gemma-7b"},
28
- "gpt-3.5-turbo-0125": {"pretty_name": "gpt-3.5-turbo-0125", "hf_model_id": "https://platform.openai.com/"},
29
- "gpt-4-0125-preview": {"pretty_name": "gpt-4-0125-preview", "hf_model_id": "https://platform.openai.com/"},
30
- "tulu-2-dpo-70b": {"pretty_name": "Tulu-2-dpo-70b", "hf_model_id": "cmu-lti/tulu-2-dpo-70b"},
31
- "vicuna-13b-v1.5": {"pretty_name": "Vicuna-13b-v1.5", "hf_model_id": "lmsys/vicuna-13b-v1.5"},
32
- "zephyr-7b-beta": {"pretty_name": "Zephyr-7b-beta", "hf_model_id": "HuggingFaceH4/zephyr-7b-beta"},
33
- "mistral-large-2402": {"pretty_name": "Mistral-Large", "hf_model_id": "https://mistral.ai/news/mistral-large/"},
34
- "claude-3-opus-20240229": {"pretty_name": "Claude 3 Opus", "hf_model_id": "https://www.anthropic.com/claude"},
35
- "claude-3-sonnet-20240229": {"pretty_name": "Claude 3 Sonnet", "hf_model_id": "https://www.anthropic.com/claude"},
36
- "zephyr-7b-gemma-v0.1": {"pretty_name": "Zephyr-7b-Gemma", "hf_model_id": "HuggingFaceH4/zephyr-7b-gemma-v0.1"},
37
- "Starling-LM-7B-beta": {"pretty_name": "StarlingLM-7B-beta", "hf_model_id": "Nexusflow/Starling-LM-7B-beta"},
38
- "dbrx-instruct": {"pretty_name": "DBRX Instruct", "hf_model_id": "databricks/dbrx-instruct"}
39
  }
 
14
  ]
15
 
16
  MODEL_INFO = {
17
+ "GPT-4": {"pretty_name": "GPT-4", "hf_model_id": "https://openai.com/blog/new-models-and-developer-products-announced-at-devday"},
18
+ "GPT-3.5": {"pretty_name": "GPT-3.5", "hf_model_id": "https://openai.com/blog/new-models-and-developer-products-announced-at-devday"},
19
+ "Llama-2": {"pretty_name": "Llama-2", "hf_model_id": "https://llama.meta.com/llama2/"},
20
+ "MPT": {"pretty_name": "MPT", "hf_model_id": "https://huggingface.co/docs/transformers/main/en/model_doc/mpt"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  }
sotopia_space/utils.py CHANGED
@@ -1,17 +1,6 @@
1
- from datasets import load_dataset, Dataset
2
- import os
3
- import json
4
- from datasets import load_dataset
5
- from datasets.utils.logging import disable_progress_bar # type: ignore
6
- from ui_constants import column_names, all_task_types
7
- import random
8
- disable_progress_bar()
9
- import math
10
  from sotopia_space.constants import MODEL_INFO
11
 
12
- id_to_data = None
13
- model_len_info = None
14
-
15
 
16
  def make_clickable_model(model_name):
17
  global MODEL_INFO
@@ -25,199 +14,26 @@ def make_clickable_model(model_name):
25
  else:
26
  return model_name
27
 
28
-
29
- def styled_error(error):
30
- return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
31
-
32
- def styled_warning(warn):
33
- return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
34
-
35
- def styled_message(message):
36
- return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
37
-
38
-
39
- def estimated_win_rate(elo_a, elo_b, LP=0):
40
- """
41
- Calculate the estimated win rate for player A against player B using their Elo ratings.
42
- :param elo_a: Elo rating of player A
43
- :param elo_b: Elo rating of player B
44
- :return: Estimated win rate for player A
45
- """
46
- exponent = (elo_b - elo_a)*(10**LP) / 400
47
- probability_a_wins = 1 / (1 + 10 ** exponent)
48
- return (1-probability_a_wins)*100
49
-
50
-
51
-
52
  # Formats the columns
53
  def formatter(x):
54
  if type(x) is str:
55
  x = x
56
  else:
57
- x = round(x, 1)
58
  return x
59
 
60
-
61
- def add_winrates(current_df, LP=0):
62
- df = current_df.copy()
63
- elo_column = "Task-Avg Elo"
64
-
65
- # Correct way to filter the DataFrame and get the Elo rating for "gpt-4-0125-preview"
66
- model_a_elo = df[df["Model"].str.contains("gpt-4")][elo_column].iloc[0]
67
-
68
- # Correct way to filter the DataFrame and get the Elo rating for "gpt-3.5-turbo-0125"
69
- model_b_elo = df[df["Model"].str.contains("gpt-3.5")][elo_column].iloc[0]
70
-
71
-
72
- # Calculate the win rate of "gpt-4-0125-preview" against all models
73
- df['Win% vs GPT-4'] = df[elo_column].apply(lambda x: estimated_win_rate(model_a_elo, x, LP=LP)).apply(formatter)
74
- df['Win% vs GPT-3.5T'] = df[elo_column].apply(lambda x: estimated_win_rate(model_b_elo, x, LP=LP)).apply(formatter)
75
- # apply the formatter for the two new columns
76
- cols = list(df.columns)
77
- cols.remove("# battles"); cols.append("# battles")
78
- cols.remove("Length"); cols.append("Length")
79
- df = df[cols]
80
- return df
81
-
82
- def add_winrates_tasks(current_df, ref="gpt-4", LP=0):
83
- new_df = current_df.copy()
84
- for t in all_task_types:
85
- column = column_names[t]
86
- model_a_elo = current_df[current_df["Model"].str.contains(ref)][column].iloc[0]
87
- new_df[column] = current_df[column].apply(lambda x: estimated_win_rate(model_a_elo, x, LP=LP)).apply(formatter)
88
- return new_df
89
-
90
-
91
  def post_processing(df, model_len_info):
92
  if model_len_info:
93
- df["Length"] = df["model name "].apply(lambda x: model_len_info[x]["avg_len"])
94
 
95
  for col in df.columns:
96
- if col == "model name ":
97
  df[col] = df[col].apply(lambda x: x.replace(x, make_clickable_model(x)))
98
  else:
99
  df[col] = df[col].apply(formatter) # For numerical values
100
  df.rename(columns=column_names, inplace=True)
101
- df.sort_values(by="Task-Avg Elo", inplace=True, ascending=False)
102
  # put the "Overall Elo" and "Task-Avg Elo" column to the front
103
  # add the length info
104
- df = df[["Model", "Task-Avg Elo"] + [col for col in df.columns if col not in ["Model", "Task-Avg Elo"]]]
105
  return df
106
-
107
- def apply_length_penalty(original_df, ablation_df, length_penalty=0.2, mode='v1', LP_original_dfs=None):
108
- """
109
- Temporarily disable the length penalty feature
110
- if mode == 'v2' and LP_original_dfs is not None:
111
- L = f"{length_penalty:.1f}"
112
- return LP_original_dfs[L]
113
- original_df = original_df.copy()
114
- ablation_df = ablation_df.copy()
115
- # replace all values in original_df with the values as z = x - y * length_penalty where y is from ablation_df at the same row and column
116
- # except for the "Model" column and the "# battles" column
117
- # do not assume the order of the rows are the same in both dataframes
118
- for i, row in original_df.iterrows():
119
- for col in original_df.columns:
120
- if col == "Model" or col == "# battles" or col == "Length":
121
- continue
122
- # assert that the model names are the same in both dataframes
123
- assert original_df.at[i, "Model"] == ablation_df[ablation_df["Model"] == row["Model"]]["Model"].values[0]
124
- original_df[col] = original_df[col].astype(float)
125
- if mode == "v1":
126
- original_df.at[i, col] = original_df.at[i, col] - ablation_df[ablation_df["Model"] == row["Model"]][col].values[0] * length_penalty
127
- elif mode == "v1.1":
128
- diff = original_df.at[i, col] - ablation_df[ablation_df["Model"] == row["Model"]][col].values[0]
129
- original_df.at[i, col] = original_df.at[i, col] * (1-length_penalty) + diff*length_penalty
130
- # post_processing
131
- original_df = post_processing(original_df, model_len_info=None)
132
- """
133
- return original_df
134
-
135
- def load_benchdata():
136
- print("Loading sotopia data...")
137
- bench_data = load_dataset("cmu-lti/sotopia", split="test")
138
- return bench_data
139
-
140
- def load_benchdata_dict():
141
- print("Loading sotopia data....")
142
- bench_data = load_dataset("cmu-lti/sotopia", data_files="sotopia_episodes_v1_hf.jsonl")['train']
143
- id_to_data = {}
144
- for item in bench_data:
145
- id_to_data[item["session_id"]] = item
146
- return id_to_data
147
-
148
- def load_eval_results():
149
- print("Loading sotopia Evaluation data...")
150
- eval_results = load_dataset("WildEval/sotopia-Evaluation", "all", split="train")
151
- return eval_results
152
-
153
- def load_infer_results(model_name):
154
- print(f"Loading sotopia Results for {model_name}...")
155
- infer_results = load_dataset("WildEval/sotopia-Results", model_name, split="train")
156
- return infer_results
157
-
158
- def sample_an_eval_result(eval_results, model_list=[], tag_list=[]):
159
- global id_to_data
160
- eval_results = list(eval_results)
161
- random.shuffle(eval_results)
162
- for eval_item in eval_results:
163
- # print(json.dumps(eval_item, indent=2))
164
- # print(f"## Session ID: {eval_item['session_id']}")
165
- # eval_item["eval_id"]
166
- assignment = eval_item['assignment']
167
- model_1, model_2 = eval_item['model_1'], eval_item['model_2']
168
- model_A = model_1 if assignment['A'] == model_1 else model_2
169
- model_B = model_2 if assignment['B'] == model_2 else model_1
170
- if len(model_list) >= 2:
171
- if model_A not in model_list or model_B not in model_list:
172
- continue
173
- elif len(model_list) == 1:
174
- if model_A != model_list[0] and model_B != model_list[0]:
175
- continue
176
- else:
177
- pass
178
- if tag_list:
179
- if set(tag_list).isdisjoint(set(eval_item['tags'])):
180
- continue
181
- winner = eval_item['winner']
182
- # print(f"## Model A: {model_A} | Model B: {model_B} | Winner: {winner}")
183
- task_type = eval_item['tags'][0] # primary task type
184
- chat_history = eval_item['history']
185
- last_query = eval_item['last_query']
186
- # print(f"## Task Type: {task_type}")
187
- # print(f"## Chat History: {chat_history}")
188
- # print(f"## Last Query --> USER: {last_query}")
189
-
190
- model_A_output = eval_item['model_1_output'] if model_1 == model_A else eval_item['model_2_output']
191
- model_B_output = eval_item['model_2_output'] if model_2 == model_B else eval_item['model_1_output']
192
-
193
- if len(model_A_output.strip()) == 0 or len(model_B_output.strip()) == 0:
194
- continue
195
-
196
- conversation_input = id_to_data[eval_item['session_id']]["conversation_input"]
197
- # print(f"\n\n\n## Model A ({model_A}) Output ##\n{model_A_output}")
198
- # print(f"\n\n\n## Model B ({model_B}) Output ##\n{model_B_output}")
199
-
200
- # print(f"\n\n\n## Winner ##\n{winner}")
201
- # print(f"\n\n\n## GPT-4 Judgement ##\n{eval_item['parsed_result']}")
202
-
203
- result_dict = {
204
- "session_id": eval_item['session_id'],
205
- "model_A": model_A,
206
- "model_B": model_B,
207
- "winner": winner,
208
- "intent": id_to_data[eval_item['session_id']]["intent"],
209
- "task_type": task_type,
210
- "all_tags": eval_item['tags'],
211
- "chat_history": chat_history,
212
- "last_query": last_query,
213
- "conversation_input": conversation_input,
214
- "model_A_output": model_A_output,
215
- "model_B_output": model_B_output,
216
- "reason": eval_item['parsed_result']["reason"],
217
- "choice": eval_item['parsed_result']["choice"],
218
- "checklist": id_to_data[eval_item['session_id']]["checklist"],
219
- }
220
- break
221
- return result_dict
222
-
223
- #id_to_data = load_benchdata_dict()
 
1
+ from ui_constants import column_names
 
 
 
 
 
 
 
 
2
  from sotopia_space.constants import MODEL_INFO
3
 
 
 
 
4
 
5
  def make_clickable_model(model_name):
6
  global MODEL_INFO
 
14
  else:
15
  return model_name
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  # Formats the columns
18
  def formatter(x):
19
  if type(x) is str:
20
  x = x
21
  else:
22
+ x = round(x, 2)
23
  return x
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  def post_processing(df, model_len_info):
26
  if model_len_info:
27
+ df["Length"] = df["model_name"].apply(lambda x: model_len_info[x]["avg_len"])
28
 
29
  for col in df.columns:
30
+ if col == "model_name":
31
  df[col] = df[col].apply(lambda x: x.replace(x, make_clickable_model(x)))
32
  else:
33
  df[col] = df[col].apply(formatter) # For numerical values
34
  df.rename(columns=column_names, inplace=True)
35
+ df.sort_values(by="GOAL [0, 10]", inplace=True, ascending=False)
36
  # put the "Overall Elo" and "Task-Avg Elo" column to the front
37
  # add the length info
38
+ df = df[["model_name", "GOAL [0, 10]"] + [col for col in df.columns if col not in ["model_name", "GOAL [0, 10]"]]]
39
  return df