Yeoni Rhee committed on
Commit
a987038
·
1 Parent(s): ac011bc

Revised Sotopia Task Submission

Browse files
sotopia_space/chat.py CHANGED
@@ -91,8 +91,6 @@ def chat_introduction():
91
  🗄️ **Disclaimer**: User prompts and generated replies from the model may be collected solely for the purpose of pure academic research. By using this demo, users implicitly agree to these terms.
92
  """
93
  )
94
- # with gr.Column(scale=1):
95
- # toggle_dark = gr.Button(value="Toggle Dark")
96
 
97
  def create_user_agent_dropdown(environment_id):
98
  _, environment_dict, agent_dict, relationship_dict = get_sotopia_profiles()
 
91
  🗄️ **Disclaimer**: User prompts and generated replies from the model may be collected solely for the purpose of pure academic research. By using this demo, users implicitly agree to these terms.
92
  """
93
  )
 
 
94
 
95
  def create_user_agent_dropdown(environment_id):
96
  _, environment_dict, agent_dict, relationship_dict = get_sotopia_profiles()
sotopia_space/constants.py CHANGED
@@ -12,28 +12,3 @@ MODEL_OPTIONS = [
12
  # "together_ai/togethercomputer/llama-2-7b-chat",
13
  # "together_ai/togethercomputer/falcon-7b-instruct",
14
  ]
15
-
16
- MODEL_INFO = {
17
- "Llama-2-13b-chat-hf.nosp": {"pretty_name": "Llama-2-13B-chat", "hf_model_id": "meta-llama/Llama-2-13b-chat-hf"},
18
- "Llama-2-70b-chat-hf.nosp": {"pretty_name": "Llama-2-70B-chat", "hf_model_id": "meta-llama/Llama-2-70b-chat-hf"},
19
- "Llama-2-7b-chat-hf.nosp": {"pretty_name": "Llama-2-7B-chat", "hf_model_id": "meta-llama/Llama-2-7b-chat-hf"},
20
- "Llama-2-7b-chat-hf": {"pretty_name": "Llama-2-7B-chat (+sys prmpt)", "hf_model_id": "meta-llama/Llama-2-7b-chat-hf"},
21
- "Mistral-7B-Instruct-v0.1": {"pretty_name": "Mistral-7B-Instruct", "hf_model_id": "mistralai/Mistral-7B-Instruct-v0.1"},
22
- "Mistral-7B-Instruct-v0.2": {"pretty_name": "Mistral-7B-Instruct (v0.2)", "hf_model_id": "mistralai/Mistral-7B-Instruct-v0.2"},
23
- "Mixtral-8x7B-Instruct-v0.1": {"pretty_name": "Mixtral-8x7B-Instruct", "hf_model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1"},
24
- "Nous-Hermes-2-Mixtral-8x7B-DPO": {"pretty_name": "Nous-Hermes-2-Mixtral-8x7B-DPO", "hf_model_id": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"},
25
- "Yi-34B-Chat": {"pretty_name": "Yi-34B-Chat", "hf_model_id": "01-ai/Yi-34B"},
26
- "gemini-1.0-pro": {"pretty_name": "gemini-1.0-pro", "hf_model_id": "https://blog.google/technology/ai/google-gemini-ai/"},
27
- "gemma-7b-it": {"pretty_name": "Gemma-7B-it", "hf_model_id": "google/gemma-7b"},
28
- "gpt-3.5-turbo-0125": {"pretty_name": "gpt-3.5-turbo-0125", "hf_model_id": "https://platform.openai.com/"},
29
- "gpt-4-0125-preview": {"pretty_name": "gpt-4-0125-preview", "hf_model_id": "https://platform.openai.com/"},
30
- "tulu-2-dpo-70b": {"pretty_name": "Tulu-2-dpo-70b", "hf_model_id": "cmu-lti/tulu-2-dpo-70b"},
31
- "vicuna-13b-v1.5": {"pretty_name": "Vicuna-13b-v1.5", "hf_model_id": "lmsys/vicuna-13b-v1.5"},
32
- "zephyr-7b-beta": {"pretty_name": "Zephyr-7b-beta", "hf_model_id": "HuggingFaceH4/zephyr-7b-beta"},
33
- "mistral-large-2402": {"pretty_name": "Mistral-Large", "hf_model_id": "https://mistral.ai/news/mistral-large/"},
34
- "claude-3-opus-20240229": {"pretty_name": "Claude 3 Opus", "hf_model_id": "https://www.anthropic.com/claude"},
35
- "claude-3-sonnet-20240229": {"pretty_name": "Claude 3 Sonnet", "hf_model_id": "https://www.anthropic.com/claude"},
36
- "zephyr-7b-gemma-v0.1": {"pretty_name": "Zephyr-7b-Gemma", "hf_model_id": "HuggingFaceH4/zephyr-7b-gemma-v0.1"},
37
- "Starling-LM-7B-beta": {"pretty_name": "StarlingLM-7B-beta", "hf_model_id": "Nexusflow/Starling-LM-7B-beta"},
38
- "dbrx-instruct": {"pretty_name": "DBRX Instruct", "hf_model_id": "databricks/dbrx-instruct"}
39
- }
 
12
  # "together_ai/togethercomputer/llama-2-7b-chat",
13
  # "together_ai/togethercomputer/falcon-7b-instruct",
14
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
sotopia_space/utils.py CHANGED
@@ -1,223 +1,3 @@
1
- from datasets import load_dataset, Dataset
2
- import os
3
- import json
4
- from datasets import load_dataset
5
- from datasets.utils.logging import disable_progress_bar # type: ignore
6
- from ui_constants import column_names, all_task_types
7
- import random
8
- disable_progress_bar()
9
- import math
10
- from sotopia_space.constants import MODEL_INFO
11
-
12
- id_to_data = None
13
- model_len_info = None
14
-
15
-
16
- def make_clickable_model(model_name):
17
- global MODEL_INFO
18
- if model_name in MODEL_INFO:
19
- if MODEL_INFO[model_name]["hf_model_id"].startswith("http"):
20
- link = MODEL_INFO[model_name]["hf_model_id"]
21
- return f'🔒 <a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{MODEL_INFO[model_name]["pretty_name"]}</a>'
22
- else:
23
- link = f"https://huggingface.co/{MODEL_INFO[model_name]['hf_model_id']}"
24
- return f'🔥 <a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{MODEL_INFO[model_name]["pretty_name"]}</a>'
25
- else:
26
- return model_name
27
-
28
-
29
- def styled_error(error):
30
- return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
31
-
32
- def styled_warning(warn):
33
- return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
34
-
35
- def styled_message(message):
36
- return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
37
-
38
-
39
- def estimated_win_rate(elo_a, elo_b, LP=0):
40
- """
41
- Calculate the estimated win rate for player A against player B using their Elo ratings.
42
- :param elo_a: Elo rating of player A
43
- :param elo_b: Elo rating of player B
44
- :return: Estimated win rate for player A
45
- """
46
- exponent = (elo_b - elo_a)*(10**LP) / 400
47
- probability_a_wins = 1 / (1 + 10 ** exponent)
48
- return (1-probability_a_wins)*100
49
-
50
-
51
-
52
- # Formats the columns
53
- def formatter(x):
54
- if type(x) is str:
55
- x = x
56
- else:
57
- x = round(x, 1)
58
- return x
59
-
60
-
61
- def add_winrates(current_df, LP=0):
62
- df = current_df.copy()
63
- elo_column = "Task-Avg Elo"
64
-
65
- # Correct way to filter the DataFrame and get the Elo rating for "gpt-4-0125-preview"
66
- model_a_elo = df[df["Model"].str.contains("gpt-4")][elo_column].iloc[0]
67
-
68
- # Correct way to filter the DataFrame and get the Elo rating for "gpt-3.5-turbo-0125"
69
- model_b_elo = df[df["Model"].str.contains("gpt-3.5")][elo_column].iloc[0]
70
-
71
-
72
- # Calculate the win rate of "gpt-4-0125-preview" against all models
73
- df['Win% vs GPT-4'] = df[elo_column].apply(lambda x: estimated_win_rate(model_a_elo, x, LP=LP)).apply(formatter)
74
- df['Win% vs GPT-3.5T'] = df[elo_column].apply(lambda x: estimated_win_rate(model_b_elo, x, LP=LP)).apply(formatter)
75
- # apply the formatter for the two new columns
76
- cols = list(df.columns)
77
- cols.remove("# battles"); cols.append("# battles")
78
- cols.remove("Length"); cols.append("Length")
79
- df = df[cols]
80
- return df
81
-
82
- def add_winrates_tasks(current_df, ref="gpt-4", LP=0):
83
- new_df = current_df.copy()
84
- for t in all_task_types:
85
- column = column_names[t]
86
- model_a_elo = current_df[current_df["Model"].str.contains(ref)][column].iloc[0]
87
- new_df[column] = current_df[column].apply(lambda x: estimated_win_rate(model_a_elo, x, LP=LP)).apply(formatter)
88
- return new_df
89
-
90
-
91
- def post_processing(df, model_len_info):
92
- if model_len_info:
93
- df["Length"] = df["model name "].apply(lambda x: model_len_info[x]["avg_len"])
94
-
95
- for col in df.columns:
96
- if col == "model name ":
97
- df[col] = df[col].apply(lambda x: x.replace(x, make_clickable_model(x)))
98
- else:
99
- df[col] = df[col].apply(formatter) # For numerical values
100
- df.rename(columns=column_names, inplace=True)
101
- df.sort_values(by="Task-Avg Elo", inplace=True, ascending=False)
102
- # put the "Overall Elo" and "Task-Avg Elo" column to the front
103
- # add the length info
104
- df = df[["Model", "Task-Avg Elo"] + [col for col in df.columns if col not in ["Model", "Task-Avg Elo"]]]
105
- return df
106
 
107
  def apply_length_penalty(original_df, ablation_df, length_penalty=0.2, mode='v1', LP_original_dfs=None):
108
- """
109
- Temporarily disable the length penalty feature
110
- if mode == 'v2' and LP_original_dfs is not None:
111
- L = f"{length_penalty:.1f}"
112
- return LP_original_dfs[L]
113
- original_df = original_df.copy()
114
- ablation_df = ablation_df.copy()
115
- # replace all values in original_df with the values as z = x - y * length_penalty where y is from ablation_df at the same row and column
116
- # except for the "Model" column and the "# battles" column
117
- # do not assume the order of the rows are the same in both dataframes
118
- for i, row in original_df.iterrows():
119
- for col in original_df.columns:
120
- if col == "Model" or col == "# battles" or col == "Length":
121
- continue
122
- # assert that the model names are the same in both dataframes
123
- assert original_df.at[i, "Model"] == ablation_df[ablation_df["Model"] == row["Model"]]["Model"].values[0]
124
- original_df[col] = original_df[col].astype(float)
125
- if mode == "v1":
126
- original_df.at[i, col] = original_df.at[i, col] - ablation_df[ablation_df["Model"] == row["Model"]][col].values[0] * length_penalty
127
- elif mode == "v1.1":
128
- diff = original_df.at[i, col] - ablation_df[ablation_df["Model"] == row["Model"]][col].values[0]
129
- original_df.at[i, col] = original_df.at[i, col] * (1-length_penalty) + diff*length_penalty
130
- # post_processing
131
- original_df = post_processing(original_df, model_len_info=None)
132
- """
133
  return original_df
134
-
135
- def load_benchdata():
136
- print("Loading sotopia data...")
137
- bench_data = load_dataset("cmu-lti/sotopia", split="test")
138
- return bench_data
139
-
140
- def load_benchdata_dict():
141
- print("Loading sotopia data....")
142
- bench_data = load_dataset("cmu-lti/sotopia", data_files="sotopia_episodes_v1_hf.jsonl")['train']
143
- id_to_data = {}
144
- for item in bench_data:
145
- id_to_data[item["session_id"]] = item
146
- return id_to_data
147
-
148
- def load_eval_results():
149
- print("Loading sotopia Evaluation data...")
150
- eval_results = load_dataset("WildEval/sotopia-Evaluation", "all", split="train")
151
- return eval_results
152
-
153
- def load_infer_results(model_name):
154
- print(f"Loading sotopia Results for {model_name}...")
155
- infer_results = load_dataset("WildEval/sotopia-Results", model_name, split="train")
156
- return infer_results
157
-
158
- def sample_an_eval_result(eval_results, model_list=[], tag_list=[]):
159
- global id_to_data
160
- eval_results = list(eval_results)
161
- random.shuffle(eval_results)
162
- for eval_item in eval_results:
163
- # print(json.dumps(eval_item, indent=2))
164
- # print(f"## Session ID: {eval_item['session_id']}")
165
- # eval_item["eval_id"]
166
- assignment = eval_item['assignment']
167
- model_1, model_2 = eval_item['model_1'], eval_item['model_2']
168
- model_A = model_1 if assignment['A'] == model_1 else model_2
169
- model_B = model_2 if assignment['B'] == model_2 else model_1
170
- if len(model_list) >= 2:
171
- if model_A not in model_list or model_B not in model_list:
172
- continue
173
- elif len(model_list) == 1:
174
- if model_A != model_list[0] and model_B != model_list[0]:
175
- continue
176
- else:
177
- pass
178
- if tag_list:
179
- if set(tag_list).isdisjoint(set(eval_item['tags'])):
180
- continue
181
- winner = eval_item['winner']
182
- # print(f"## Model A: {model_A} | Model B: {model_B} | Winner: {winner}")
183
- task_type = eval_item['tags'][0] # primary task type
184
- chat_history = eval_item['history']
185
- last_query = eval_item['last_query']
186
- # print(f"## Task Type: {task_type}")
187
- # print(f"## Chat History: {chat_history}")
188
- # print(f"## Last Query --> USER: {last_query}")
189
-
190
- model_A_output = eval_item['model_1_output'] if model_1 == model_A else eval_item['model_2_output']
191
- model_B_output = eval_item['model_2_output'] if model_2 == model_B else eval_item['model_1_output']
192
-
193
- if len(model_A_output.strip()) == 0 or len(model_B_output.strip()) == 0:
194
- continue
195
-
196
- conversation_input = id_to_data[eval_item['session_id']]["conversation_input"]
197
- # print(f"\n\n\n## Model A ({model_A}) Output ##\n{model_A_output}")
198
- # print(f"\n\n\n## Model B ({model_B}) Output ##\n{model_B_output}")
199
-
200
- # print(f"\n\n\n## Winner ##\n{winner}")
201
- # print(f"\n\n\n## GPT-4 Judgement ##\n{eval_item['parsed_result']}")
202
-
203
- result_dict = {
204
- "session_id": eval_item['session_id'],
205
- "model_A": model_A,
206
- "model_B": model_B,
207
- "winner": winner,
208
- "intent": id_to_data[eval_item['session_id']]["intent"],
209
- "task_type": task_type,
210
- "all_tags": eval_item['tags'],
211
- "chat_history": chat_history,
212
- "last_query": last_query,
213
- "conversation_input": conversation_input,
214
- "model_A_output": model_A_output,
215
- "model_B_output": model_B_output,
216
- "reason": eval_item['parsed_result']["reason"],
217
- "choice": eval_item['parsed_result']["choice"],
218
- "checklist": id_to_data[eval_item['session_id']]["checklist"],
219
- }
220
- break
221
- return result_dict
222
-
223
- #id_to_data = load_benchdata_dict()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
  def apply_length_penalty(original_df, ablation_df, length_penalty=0.2, mode='v1', LP_original_dfs=None):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  return original_df