Spaces: Runtime error
Yeoni Rhee committed · Commit a987038 · 1 Parent(s): ac011bc

Revised Sotopia Task Submission

Browse files:
- sotopia_space/chat.py +0 -2
- sotopia_space/constants.py +0 -25
- sotopia_space/utils.py +0 -220
sotopia_space/chat.py
CHANGED
@@ -91,8 +91,6 @@ def chat_introduction():
     🗄️ **Disclaimer**: User prompts and generated replies from the model may be collected solely for the purpose of pure academic research. By using this demo, users implicitly agree to these terms.
     """
     )
-    # with gr.Column(scale=1):
-    # toggle_dark = gr.Button(value="Toggle Dark")
 
 def create_user_agent_dropdown(environment_id):
     _, environment_dict, agent_dict, relationship_dict = get_sotopia_profiles()
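The two lines removed here were commented-out remnants of a Gradio dark-mode toggle. For reference only (not part of this commit), a minimal sketch of what that toggle looks like when wired up; the client-side `js` keyword is the Gradio 4.x spelling (3.x used `_js`):

```python
import gradio as gr

with gr.Blocks() as demo:
    with gr.Column(scale=1):
        toggle_dark = gr.Button(value="Toggle Dark")
    # fn=None with a js snippet runs entirely in the browser, flipping the
    # CSS class that Gradio themes key their dark palette off of.
    toggle_dark.click(
        None,
        js="() => { document.body.classList.toggle('dark'); }",
    )

demo.launch()
```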
sotopia_space/constants.py
CHANGED
@@ -12,28 +12,3 @@ MODEL_OPTIONS = [
     # "together_ai/togethercomputer/llama-2-7b-chat",
     # "together_ai/togethercomputer/falcon-7b-instruct",
 ]
-
-MODEL_INFO = {
-    "Llama-2-13b-chat-hf.nosp": {"pretty_name": "Llama-2-13B-chat", "hf_model_id": "meta-llama/Llama-2-13b-chat-hf"},
-    "Llama-2-70b-chat-hf.nosp": {"pretty_name": "Llama-2-70B-chat", "hf_model_id": "meta-llama/Llama-2-70b-chat-hf"},
-    "Llama-2-7b-chat-hf.nosp": {"pretty_name": "Llama-2-7B-chat", "hf_model_id": "meta-llama/Llama-2-7b-chat-hf"},
-    "Llama-2-7b-chat-hf": {"pretty_name": "Llama-2-7B-chat (+sys prmpt)", "hf_model_id": "meta-llama/Llama-2-7b-chat-hf"},
-    "Mistral-7B-Instruct-v0.1": {"pretty_name": "Mistral-7B-Instruct", "hf_model_id": "mistralai/Mistral-7B-Instruct-v0.1"},
-    "Mistral-7B-Instruct-v0.2": {"pretty_name": "Mistral-7B-Instruct (v0.2)", "hf_model_id": "mistralai/Mistral-7B-Instruct-v0.2"},
-    "Mixtral-8x7B-Instruct-v0.1": {"pretty_name": "Mixtral-8x7B-Instruct", "hf_model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1"},
-    "Nous-Hermes-2-Mixtral-8x7B-DPO": {"pretty_name": "Nous-Hermes-2-Mixtral-8x7B-DPO", "hf_model_id": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"},
-    "Yi-34B-Chat": {"pretty_name": "Yi-34B-Chat", "hf_model_id": "01-ai/Yi-34B"},
-    "gemini-1.0-pro": {"pretty_name": "gemini-1.0-pro", "hf_model_id": "https://blog.google/technology/ai/google-gemini-ai/"},
-    "gemma-7b-it": {"pretty_name": "Gemma-7B-it", "hf_model_id": "google/gemma-7b"},
-    "gpt-3.5-turbo-0125": {"pretty_name": "gpt-3.5-turbo-0125", "hf_model_id": "https://platform.openai.com/"},
-    "gpt-4-0125-preview": {"pretty_name": "gpt-4-0125-preview", "hf_model_id": "https://platform.openai.com/"},
-    "tulu-2-dpo-70b": {"pretty_name": "Tulu-2-dpo-70b", "hf_model_id": "cmu-lti/tulu-2-dpo-70b"},
-    "vicuna-13b-v1.5": {"pretty_name": "Vicuna-13b-v1.5", "hf_model_id": "lmsys/vicuna-13b-v1.5"},
-    "zephyr-7b-beta": {"pretty_name": "Zephyr-7b-beta", "hf_model_id": "HuggingFaceH4/zephyr-7b-beta"},
-    "mistral-large-2402": {"pretty_name": "Mistral-Large", "hf_model_id": "https://mistral.ai/news/mistral-large/"},
-    "claude-3-opus-20240229": {"pretty_name": "Claude 3 Opus", "hf_model_id": "https://www.anthropic.com/claude"},
-    "claude-3-sonnet-20240229": {"pretty_name": "Claude 3 Sonnet", "hf_model_id": "https://www.anthropic.com/claude"},
-    "zephyr-7b-gemma-v0.1": {"pretty_name": "Zephyr-7b-Gemma", "hf_model_id": "HuggingFaceH4/zephyr-7b-gemma-v0.1"},
-    "Starling-LM-7B-beta": {"pretty_name": "StarlingLM-7B-beta", "hf_model_id": "Nexusflow/Starling-LM-7B-beta"},
-    "dbrx-instruct": {"pretty_name": "DBRX Instruct", "hf_model_id": "databricks/dbrx-instruct"}
-}
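The deleted `MODEL_INFO` table mapped each model key to a display name plus either a Hugging Face repo id or, for closed models, a plain URL. A minimal sketch of the lookup logic that consumed it, distilled from `make_clickable_model` in sotopia_space/utils.py (also deleted in this commit); the helper name `model_link` is hypothetical, and the two entries are copied from the table above:

```python
# Two entries copied from the deleted table: a closed model keyed to a plain
# URL and an open model keyed to a Hugging Face repo id.
MODEL_INFO = {
    "gpt-4-0125-preview": {"pretty_name": "gpt-4-0125-preview", "hf_model_id": "https://platform.openai.com/"},
    "zephyr-7b-beta": {"pretty_name": "Zephyr-7b-beta", "hf_model_id": "HuggingFaceH4/zephyr-7b-beta"},
}

def model_link(name: str) -> str:
    """Hypothetical helper mirroring make_clickable_model's branching."""
    info = MODEL_INFO.get(name)
    if info is None:
        return name  # unknown keys fall through unchanged
    model_id = info["hf_model_id"]
    # URLs are used verbatim; bare repo ids become Hugging Face hub links.
    url = model_id if model_id.startswith("http") else f"https://huggingface.co/{model_id}"
    return f"{info['pretty_name']} -> {url}"

print(model_link("zephyr-7b-beta"))
# Zephyr-7b-beta -> https://huggingface.co/HuggingFaceH4/zephyr-7b-beta
```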
sotopia_space/utils.py
CHANGED
@@ -1,223 +1,3 @@
-from datasets import load_dataset, Dataset
-import os
-import json
-from datasets import load_dataset
-from datasets.utils.logging import disable_progress_bar # type: ignore
-from ui_constants import column_names, all_task_types
-import random
-disable_progress_bar()
-import math
-from sotopia_space.constants import MODEL_INFO
-
-id_to_data = None
-model_len_info = None
-
-
-def make_clickable_model(model_name):
-    global MODEL_INFO
-    if model_name in MODEL_INFO:
-        if MODEL_INFO[model_name]["hf_model_id"].startswith("http"):
-            link = MODEL_INFO[model_name]["hf_model_id"]
-            return f'🔒 <a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{MODEL_INFO[model_name]["pretty_name"]}</a>'
-        else:
-            link = f"https://huggingface.co/{MODEL_INFO[model_name]['hf_model_id']}"
-            return f'🔥 <a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{MODEL_INFO[model_name]["pretty_name"]}</a>'
-    else:
-        return model_name
-
-
-def styled_error(error):
-    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
-
-def styled_warning(warn):
-    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
-
-def styled_message(message):
-    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
-
-
-def estimated_win_rate(elo_a, elo_b, LP=0):
-    """
-    Calculate the estimated win rate for player A against player B using their Elo ratings.
-    :param elo_a: Elo rating of player A
-    :param elo_b: Elo rating of player B
-    :return: Estimated win rate for player A
-    """
-    exponent = (elo_b - elo_a)*(10**LP) / 400
-    probability_a_wins = 1 / (1 + 10 ** exponent)
-    return (1-probability_a_wins)*100
-
-
-
-# Formats the columns
-def formatter(x):
-    if type(x) is str:
-        x = x
-    else:
-        x = round(x, 1)
-    return x
-
-
-def add_winrates(current_df, LP=0):
-    df = current_df.copy()
-    elo_column = "Task-Avg Elo"
-
-    # Correct way to filter the DataFrame and get the Elo rating for "gpt-4-0125-preview"
-    model_a_elo = df[df["Model"].str.contains("gpt-4")][elo_column].iloc[0]
-
-    # Correct way to filter the DataFrame and get the Elo rating for "gpt-3.5-turbo-0125"
-    model_b_elo = df[df["Model"].str.contains("gpt-3.5")][elo_column].iloc[0]
-
-
-    # Calculate the win rate of "gpt-4-0125-preview" against all models
-    df['Win% vs GPT-4'] = df[elo_column].apply(lambda x: estimated_win_rate(model_a_elo, x, LP=LP)).apply(formatter)
-    df['Win% vs GPT-3.5T'] = df[elo_column].apply(lambda x: estimated_win_rate(model_b_elo, x, LP=LP)).apply(formatter)
-    # apply the formatter for the two new columns
-    cols = list(df.columns)
-    cols.remove("# battles"); cols.append("# battles")
-    cols.remove("Length"); cols.append("Length")
-    df = df[cols]
-    return df
-
-def add_winrates_tasks(current_df, ref="gpt-4", LP=0):
-    new_df = current_df.copy()
-    for t in all_task_types:
-        column = column_names[t]
-        model_a_elo = current_df[current_df["Model"].str.contains(ref)][column].iloc[0]
-        new_df[column] = current_df[column].apply(lambda x: estimated_win_rate(model_a_elo, x, LP=LP)).apply(formatter)
-    return new_df
-
-
-def post_processing(df, model_len_info):
-    if model_len_info:
-        df["Length"] = df["model name "].apply(lambda x: model_len_info[x]["avg_len"])
-
-    for col in df.columns:
-        if col == "model name ":
-            df[col] = df[col].apply(lambda x: x.replace(x, make_clickable_model(x)))
-        else:
-            df[col] = df[col].apply(formatter) # For numerical values
-    df.rename(columns=column_names, inplace=True)
-    df.sort_values(by="Task-Avg Elo", inplace=True, ascending=False)
-    # put the "Overall Elo" and "Task-Avg Elo" column to the front
-    # add the length info
-    df = df[["Model", "Task-Avg Elo"] + [col for col in df.columns if col not in ["Model", "Task-Avg Elo"]]]
-    return df
 
 def apply_length_penalty(original_df, ablation_df, length_penalty=0.2, mode='v1', LP_original_dfs=None):
-    """
-    Temporarily disable the length penalty feature
-    if mode == 'v2' and LP_original_dfs is not None:
-        L = f"{length_penalty:.1f}"
-        return LP_original_dfs[L]
-    original_df = original_df.copy()
-    ablation_df = ablation_df.copy()
-    # replace all values in original_df with the values as z = x - y * length_penalty where y is from ablation_df at the same row and column
-    # except for the "Model" column and the "# battles" column
-    # do not assume the order of the rows are the same in both dataframes
-    for i, row in original_df.iterrows():
-        for col in original_df.columns:
-            if col == "Model" or col == "# battles" or col == "Length":
-                continue
-            # assert that the model names are the same in both dataframes
-            assert original_df.at[i, "Model"] == ablation_df[ablation_df["Model"] == row["Model"]]["Model"].values[0]
-            original_df[col] = original_df[col].astype(float)
-            if mode == "v1":
-                original_df.at[i, col] = original_df.at[i, col] - ablation_df[ablation_df["Model"] == row["Model"]][col].values[0] * length_penalty
-            elif mode == "v1.1":
-                diff = original_df.at[i, col] - ablation_df[ablation_df["Model"] == row["Model"]][col].values[0]
-                original_df.at[i, col] = original_df.at[i, col] * (1-length_penalty) + diff*length_penalty
-    # post_processing
-    original_df = post_processing(original_df, model_len_info=None)
-    """
     return original_df
-
-def load_benchdata():
-    print("Loading sotopia data...")
-    bench_data = load_dataset("cmu-lti/sotopia", split="test")
-    return bench_data
-
-def load_benchdata_dict():
-    print("Loading sotopia data....")
-    bench_data = load_dataset("cmu-lti/sotopia", data_files="sotopia_episodes_v1_hf.jsonl")['train']
-    id_to_data = {}
-    for item in bench_data:
-        id_to_data[item["session_id"]] = item
-    return id_to_data
-
-def load_eval_results():
-    print("Loading sotopia Evaluation data...")
-    eval_results = load_dataset("WildEval/sotopia-Evaluation", "all", split="train")
-    return eval_results
-
-def load_infer_results(model_name):
-    print(f"Loading sotopia Results for {model_name}...")
-    infer_results = load_dataset("WildEval/sotopia-Results", model_name, split="train")
-    return infer_results
-
-def sample_an_eval_result(eval_results, model_list=[], tag_list=[]):
-    global id_to_data
-    eval_results = list(eval_results)
-    random.shuffle(eval_results)
-    for eval_item in eval_results:
-        # print(json.dumps(eval_item, indent=2))
-        # print(f"## Session ID: {eval_item['session_id']}")
-        # eval_item["eval_id"]
-        assignment = eval_item['assignment']
-        model_1, model_2 = eval_item['model_1'], eval_item['model_2']
-        model_A = model_1 if assignment['A'] == model_1 else model_2
-        model_B = model_2 if assignment['B'] == model_2 else model_1
-        if len(model_list) >= 2:
-            if model_A not in model_list or model_B not in model_list:
-                continue
-        elif len(model_list) == 1:
-            if model_A != model_list[0] and model_B != model_list[0]:
-                continue
-        else:
-            pass
-        if tag_list:
-            if set(tag_list).isdisjoint(set(eval_item['tags'])):
-                continue
-        winner = eval_item['winner']
-        # print(f"## Model A: {model_A} | Model B: {model_B} | Winner: {winner}")
-        task_type = eval_item['tags'][0] # primary task type
-        chat_history = eval_item['history']
-        last_query = eval_item['last_query']
-        # print(f"## Task Type: {task_type}")
-        # print(f"## Chat History: {chat_history}")
-        # print(f"## Last Query --> USER: {last_query}")
-
-        model_A_output = eval_item['model_1_output'] if model_1 == model_A else eval_item['model_2_output']
-        model_B_output = eval_item['model_2_output'] if model_2 == model_B else eval_item['model_1_output']
-
-        if len(model_A_output.strip()) == 0 or len(model_B_output.strip()) == 0:
-            continue
-
-        conversation_input = id_to_data[eval_item['session_id']]["conversation_input"]
-        # print(f"\n\n\n## Model A ({model_A}) Output ##\n{model_A_output}")
-        # print(f"\n\n\n## Model B ({model_B}) Output ##\n{model_B_output}")
-
-        # print(f"\n\n\n## Winner ##\n{winner}")
-        # print(f"\n\n\n## GPT-4 Judgement ##\n{eval_item['parsed_result']}")
-
-        result_dict = {
-            "session_id": eval_item['session_id'],
-            "model_A": model_A,
-            "model_B": model_B,
-            "winner": winner,
-            "intent": id_to_data[eval_item['session_id']]["intent"],
-            "task_type": task_type,
-            "all_tags": eval_item['tags'],
-            "chat_history": chat_history,
-            "last_query": last_query,
-            "conversation_input": conversation_input,
-            "model_A_output": model_A_output,
-            "model_B_output": model_B_output,
-            "reason": eval_item['parsed_result']["reason"],
-            "choice": eval_item['parsed_result']["choice"],
-            "checklist": id_to_data[eval_item['session_id']]["checklist"],
-        }
-        break
-    return result_dict
-
-#id_to_data = load_benchdata_dict()
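Among the deleted helpers, `estimated_win_rate` implemented the textbook Elo expectation formula. A standalone copy with a worked example; note that, contrary to its docstring, the value returned is the expected win rate of player B against player A, which is consistent with `add_winrates` passing the reference model's rating as `elo_a` to fill each row's "Win% vs GPT-4" column:

```python
def estimated_win_rate(elo_a, elo_b, LP=0):
    # Textbook Elo expectation: P(A beats B) = 1 / (1 + 10 ** ((elo_b - elo_a) / 400)).
    # LP > 0 steepens the curve by scaling the rating gap by 10 ** LP.
    exponent = (elo_b - elo_a) * (10 ** LP) / 400
    probability_a_wins = 1 / (1 + 10 ** exponent)
    return (1 - probability_a_wins) * 100  # B's expected win percentage against A

# A model rated 200 points below the reference is expected to win ~24% of battles:
print(round(estimated_win_rate(1250, 1050), 2))  # 24.03
```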
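The disabled body of `apply_length_penalty` documented two penalty modes: v1 computes z = x - y * length_penalty, and v1.1 blends the raw score with its ablation gap, z = x * (1 - lp) + (x - y) * lp. Expanding v1.1 shows the two are algebraically identical (both reduce to x - y * lp). A scalar sketch of that logic, with hypothetical names since the feature is commented out in the source:

```python
def penalized_score(x: float, y: float, lp: float = 0.2, mode: str = "v1") -> float:
    """Hypothetical scalar reduction of the disabled length-penalty logic.

    x: score from the original run; y: score from the length-ablation run;
    lp: penalty strength.
    """
    if mode == "v1":
        return x - y * lp                    # subtract a fraction of the ablation score
    if mode == "v1.1":
        return x * (1 - lp) + (x - y) * lp   # blend the score with its ablation gap
    return x                                 # penalty disabled: pass scores through

# The two modes coincide: x*(1-lp) + (x-y)*lp == x - y*lp.
print(penalized_score(1100.0, 1000.0, lp=0.25, mode="v1"))    # 850.0
print(penalized_score(1100.0, 1000.0, lp=0.25, mode="v1.1"))  # 850.0
```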