Xuhui wdplx committed on
Commit
0adb6ea
·
1 Parent(s): 7d4925b

Leaderboard and Unified UI (#61)

Browse files

* Update start_app.sh to use gradio instead of python app.py

* fixed action typing error

---------

Co-authored-by: Jasonqi146 <jasonqi146@gmail.com>

README.md CHANGED
@@ -11,3 +11,16 @@ license: apache-2.0
11
  ---
12
 
13
  This is a synced repository with a Huggingface Space for the Sotopia project [space](https://huggingface.co/spaces/wdplx/Sotopia-demo)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  ---
12
 
13
  This is a synced repository with a Huggingface Space for the Sotopia project [space](https://huggingface.co/spaces/wdplx/Sotopia-demo)
14
+
15
+ ## Getting Started
16
+
17
+ ```bash
18
+ conda create -n sotopia-space python=3.11; conda activate sotopia-space
19
+ python -m pip install -r requirements.txt
20
+ ```
21
+
22
+ To run the app, run the following command:
23
+
24
+ ```bash
25
+ bash start_app.sh
26
+ ```
app.py CHANGED
@@ -1,332 +1,83 @@
1
  import os
2
- from collections import defaultdict
3
- import json
4
  from typing import Literal
5
 
6
- import gradio as gr
 
 
 
7
 
8
- from utils import Environment, Agent, get_context_prompt, dialogue_history_prompt
9
- from functools import cache
10
- from sotopia_pi_generate import prepare_model, generate_action
11
 
12
  OPENAI_KEY_FILE="./openai_api.key"
13
  if os.path.exists(OPENAI_KEY_FILE):
14
  with open(OPENAI_KEY_FILE, "r") as f:
15
  os.environ["OPENAI_API_KEY"] = f.read().strip()
16
 
17
- DEPLOYED = os.getenv("DEPLOYED", "true").lower() == "true"
18
- DEFAULT_MODEL_SELECTION = "gpt-3.5-turbo"
19
- TEMPERATURE = 0.7
20
- TOP_P = 1
21
- MAX_TOKENS = 1024
22
 
23
- ENVIRONMENT_PROFILES = "profiles/environment_profiles.jsonl"
24
- AGENT_PROFILES = "profiles/agent_profiles.jsonl"
25
- RELATIONSHIP_PROFILES = "profiles/relationship_profiles.jsonl"
26
-
27
- ACTION_TYPES = ['none', 'action', 'non-verbal communication', 'speak', 'leave']
28
-
29
- MODEL_OPTIONS = [
30
- "gpt-3.5-turbo",
31
- "gpt-4",
32
- "gpt-4-turbo",
33
- "cmu-lti/sotopia-pi-mistral-7b-BC_SR",
34
- "cmu-lti/sotopia-pi-mistral-7b-BC_SR_4bit",
35
- "mistralai/Mistral-7B-Instruct-v0.1"
36
- # "mistralai/Mixtral-8x7B-Instruct-v0.1",
37
- # "togethercomputer/llama-2-7b-chat",
38
- # "togethercomputer/llama-2-70b-chat",
39
- # "togethercomputer/mpt-30b-chat",
40
- # "together_ai/togethercomputer/llama-2-7b-chat",
41
- # "together_ai/togethercomputer/falcon-7b-instruct",
42
- ]
43
-
44
- @cache
45
- def get_sotopia_profiles(env_file=ENVIRONMENT_PROFILES, agent_file=AGENT_PROFILES, relationship_file=RELATIONSHIP_PROFILES):
46
- with open(env_file, 'r') as f:
47
- data = [json.loads(line) for line in f.readlines()]
48
-
49
- code_names_count = defaultdict(int)
50
- environments = []
51
- environment_dict = {}
52
- for profile in sorted(data, key=lambda x: x['codename']):
53
- env_obj = Environment(profile)
54
- if profile['codename'] in code_names_count:
55
- environments.append((
56
- "{}_{:05d}".format(profile['codename'],
57
- code_names_count[profile['codename']]
58
- ),
59
- env_obj._id
60
- ))
61
- else:
62
- environments.append((profile['codename'], env_obj._id))
63
- environment_dict[env_obj._id] = env_obj
64
- code_names_count[profile['codename']] += 1
65
-
66
- with open(agent_file, 'r') as f:
67
- data = [json.loads(line) for line in f.readlines()]
68
-
69
- agent_dict = {}
70
- for profile in data:
71
- agent_obj = Agent(profile)
72
- agent_dict[agent_obj._id] = agent_obj
73
-
74
- with open(relationship_file, 'r') as f:
75
- data = [json.loads(line) for line in f.readlines()]
76
-
77
- relationship_dict = defaultdict(lambda : defaultdict(list))
78
- for profile in data:
79
- relationship_dict[profile['relationship']][profile['agent1_id']].append(profile['agent2_id'])
80
- relationship_dict[profile['relationship']][profile['agent2_id']].append(profile['agent1_id'])
81
-
82
- return environments, environment_dict, agent_dict, relationship_dict
83
-
84
-
85
- def introduction():
86
  with gr.Column(scale=2):
87
- gr.Image(
88
- "images/sotopia.jpg", elem_id="banner-image", show_label=False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  )
90
- with gr.Column(scale=5):
91
- gr.Markdown(
92
- """# Sotopia Space
93
- **Chat with different social agent models including [sotopia-pi](https://github.com/sotopia-lab/sotopia-pi), GPT and so on in sotopia space!**
94
-
95
- ➡️️ **Intended Use**: Sotopia space is intended to showcase the social intelligence ability of different social agents in interesting social scenarios.
96
-
97
- **Guidance**:
98
-
99
- Step (1) Select a social scenario that interests you in "Scenario Selection"
100
-
101
- Step (2) Select a social agent you want to chat with in "Model Selection"
102
-
103
- Step (3) Select which character you and your social agent will play in the scenario in "User Agent Selection" and "Bot Agent Selection"
104
-
105
- Step (4) Negotiate/debate/cooperate with the social agent to see whether your goal or their social goal can be achieved.
106
-
107
- ⚠️ **Limitations**: The social agent can and will produce factually incorrect information, hallucinating facts and potentially offensive actions. It can produce problematic outputs, especially if prompted to do so.
108
-
109
- 🗄️ **Disclaimer**: User prompts and generated replies from the model may be collected solely for the purpose of pure academic research. By using this demo, users implicitly agree to these terms.
110
- """
111
- )
112
-
113
- def create_user_agent_dropdown(environment_id):
114
- _, environment_dict, agent_dict, relationship_dict = get_sotopia_profiles()
115
- environment = environment_dict[environment_id]
116
-
117
- user_agents_list = []
118
- unique_agent_ids = set()
119
- for x, _ in relationship_dict[environment.relationship].items():
120
- unique_agent_ids.add(x)
121
-
122
- for agent_id in unique_agent_ids:
123
- user_agents_list.append((agent_dict[agent_id].name, agent_id))
124
- return gr.Dropdown(choices=user_agents_list, value=user_agents_list[0][1] if user_agents_list else None, label="User Agent Selection")
125
-
126
- def create_bot_agent_dropdown(environment_id, user_agent_id):
127
- _, environment_dict, agent_dict, relationship_dict = get_sotopia_profiles()
128
- environment, user_agent = environment_dict[environment_id], agent_dict[user_agent_id]
129
-
130
- bot_agent_list = []
131
- for neighbor_id in relationship_dict[environment.relationship][user_agent.agent_id]:
132
- bot_agent_list.append((agent_dict[neighbor_id].name, neighbor_id))
133
-
134
- return gr.Dropdown(choices=bot_agent_list, value=bot_agent_list[0][1] if bot_agent_list else None, label="Bot Agent Selection")
135
-
136
- def create_environment_info(environment_dropdown):
137
- _, environment_dict, _, _ = get_sotopia_profiles()
138
- environment = environment_dict[environment_dropdown]
139
- text = environment.scenario
140
- return gr.Textbox(label="Scenario", lines=1, value=text)
141
-
142
- def create_user_info(user_agent_dropdown):
143
- _, _, agent_dict, _ = get_sotopia_profiles()
144
- user_agent = agent_dict[user_agent_dropdown]
145
- text = f"{user_agent.background} {user_agent.personality}"
146
- return gr.Textbox(label="User Agent Profile", lines=4, value=text)
147
-
148
- def create_bot_info(bot_agent_dropdown):
149
- _, _, agent_dict, _ = get_sotopia_profiles()
150
- bot_agent = agent_dict[bot_agent_dropdown]
151
- text = f"{bot_agent.background} {bot_agent.personality}"
152
- return gr.Textbox(label="Bot Agent Profile", lines=4, value=text)
153
-
154
- def create_user_goal(environment_dropdown):
155
- _, environment_dict, _, _ = get_sotopia_profiles()
156
- text = environment_dict[environment_dropdown].agent_goals[0]
157
- text = text.replace('(', '').replace(')', '')
158
- if "<extra_info>" in text:
159
- text = text.replace("<extra_info>", "\n\n")
160
- text = text.replace("</extra_info>", "\n")
161
- if "<strategy_hint>" in text:
162
- text = text.replace("<strategy_hint>", "\n\n")
163
- text = text.replace("</strategy_hint>", "\n")
164
- return gr.Textbox(label="User Agent Goal", lines=4, value=text)
165
-
166
- def create_bot_goal(environment_dropdown):
167
- _, environment_dict, _, _ = get_sotopia_profiles()
168
- text = environment_dict[environment_dropdown].agent_goals[1]
169
- text = text.replace('(', '').replace(')', '')
170
- if "<extra_info>" in text:
171
- text = text.replace("<extra_info>", "\n\n")
172
- text = text.replace("</extra_info>", "\n")
173
- if "<strategy_hint>" in text:
174
- text = text.replace("<strategy_hint>", "\n\n")
175
- text = text.replace("</strategy_hint>", "\n")
176
- return gr.Textbox(label="Bot Agent Goal", lines=4, value=text)
177
-
178
- def sotopia_info_accordion(accordion_visible=True):
179
- environments, _, _, _ = get_sotopia_profiles()
180
-
181
- with gr.Accordion("Create your sotopia space!", open=accordion_visible):
182
- with gr.Row():
183
- environment_dropdown = gr.Dropdown(
184
- choices=environments,
185
- label="Scenario Selection",
186
- value=environments[0][1] if environments else None,
187
- interactive=True,
188
- )
189
- model_name_dropdown = gr.Dropdown(
190
- choices=MODEL_OPTIONS,
191
- value=DEFAULT_MODEL_SELECTION,
192
- interactive=True,
193
- label="Model Selection"
194
- )
195
-
196
- with gr.Row():
197
- user_agent_dropdown = create_user_agent_dropdown(environment_dropdown.value)
198
- bot_agent_dropdown = create_bot_agent_dropdown(environment_dropdown.value, user_agent_dropdown.value)
199
-
200
- with gr.Accordion("Check your social task!", open=accordion_visible):
201
-
202
- scenario_info_display = create_environment_info(environment_dropdown.value)
203
-
204
- with gr.Row():
205
- bot_goal_display = create_bot_goal(environment_dropdown.value)
206
- user_goal_display = create_user_goal(environment_dropdown.value)
207
-
208
-
209
-
210
- with gr.Row():
211
- bot_agent_info_display = create_bot_info(bot_agent_dropdown.value)
212
- user_agent_info_display = create_user_info(user_agent_dropdown.value)
213
-
214
- # Update user dropdown when scenario changes
215
- environment_dropdown.change(fn=create_user_agent_dropdown, inputs=[environment_dropdown], outputs=[user_agent_dropdown])
216
- # Update bot dropdown when user or scenario changes
217
- user_agent_dropdown.change(fn=create_bot_agent_dropdown, inputs=[environment_dropdown, user_agent_dropdown], outputs=[bot_agent_dropdown])
218
- # Update scenario information when scenario changes
219
- environment_dropdown.change(fn=create_environment_info, inputs=[environment_dropdown], outputs=[scenario_info_display])
220
- # Update user agent profile when user changes
221
- user_agent_dropdown.change(fn=create_user_info, inputs=[user_agent_dropdown], outputs=[user_agent_info_display])
222
- # Update bot agent profile when bot changes
223
- bot_agent_dropdown.change(fn=create_bot_info, inputs=[bot_agent_dropdown], outputs=[bot_agent_info_display])
224
- # Update user goal when scenario changes
225
- environment_dropdown.change(fn=create_user_goal, inputs=[environment_dropdown], outputs=[user_goal_display])
226
- # Update bot goal when scenario changes
227
- environment_dropdown.change(fn=create_bot_goal, inputs=[environment_dropdown], outputs=[bot_goal_display])
228
-
229
- return model_name_dropdown, environment_dropdown, user_agent_dropdown, bot_agent_dropdown
230
-
231
- def instructions_accordion(instructions, according_visible=False):
232
- with gr.Accordion("Instructions", open=False, visible=according_visible):
233
- instructions = gr.Textbox(
234
- lines=10,
235
- value=instructions,
236
- interactive=False,
237
- placeholder="Instructions",
238
- show_label=False,
239
- max_lines=10,
240
- visible=False,
241
- )
242
- return instructions
243
-
244
-
245
- def chat_tab():
246
- # history are input output pairs
247
- _, environment_dict, agent_dict, _ = get_sotopia_profiles()
248
- def run_chat(
249
- message,
250
- history,
251
- environment_selection,
252
- user_agent_dropdown,
253
- bot_agent_dropdown,
254
- model_selection:str
255
- ):
256
- environment = environment_dict[environment_selection]
257
- user_agent = agent_dict[user_agent_dropdown]
258
- bot_agent = agent_dict[bot_agent_dropdown]
259
-
260
- context = get_context_prompt(bot_agent, user_agent, environment)
261
- dialogue_history, next_turn_idx = dialogue_history_prompt(message, history, user_agent, bot_agent)
262
- prompt_history = f"{context}{dialogue_history}"
263
- agent_action = generate_action(model_selection, prompt_history, next_turn_idx, ACTION_TYPES, bot_agent.name, TEMPERATURE)
264
- return agent_action.to_natural_language()
265
-
266
- with gr.Column():
267
- with gr.Blocks():
268
- model_name_dropdown, scenario_dropdown, user_agent_dropdown, bot_agent_dropdown = sotopia_info_accordion()
269
-
270
- with gr.Column():
271
- with gr.Accordion("Start the conversation to achieve your goal!", open=True):
272
- gr.ChatInterface(
273
- fn=run_chat,
274
- chatbot=gr.Chatbot(
275
- height=620,
276
- render=False,
277
- show_label=False,
278
- rtl=False,
279
- avatar_images=(
280
- "images/profile1.jpg",
281
- "images/profile2.jpg",
282
- ),
283
- ),
284
- textbox=gr.Textbox(
285
- placeholder="Write your message here...",
286
- render=False,
287
- scale=7,
288
- rtl=False,
289
- ),
290
- additional_inputs=[
291
- scenario_dropdown,
292
- user_agent_dropdown,
293
- bot_agent_dropdown,
294
- model_name_dropdown,
295
- ],
296
- submit_btn="Send",
297
- stop_btn="Stop",
298
- retry_btn="🔄 Retry",
299
- undo_btn="↩️ Delete",
300
- clear_btn="🗑️ Clear",
301
- )
302
-
303
-
304
- def main():
305
- with gr.Blocks(
306
- css="""#chat_container {height: 820px; width: 1000px; margin-left: auto; margin-right: auto;}
307
- #chatbot {height: 600px; overflow: auto;}
308
- #create_container {height: 750px; margin-left: 0px; margin-right: 0px;}
309
- #tokenizer_renderer span {white-space: pre-wrap}
310
- """
311
- ) as demo:
312
- with gr.Row():
313
- introduction()
314
- with gr.Row():
315
- chat_tab()
316
-
317
- return demo
318
-
319
-
320
- def start_demo():
321
- demo = main()
322
- if DEPLOYED:
323
- demo.queue(api_open=False).launch(show_api=False)
324
- else:
325
- demo.queue()
326
- demo.launch(share=False, server_name="0.0.0.0")
327
 
328
 
329
  if __name__ == "__main__":
 
 
 
330
  get_sotopia_profiles()
331
  # prepare_model(DEFAULT_MODEL_SELECTION)
332
- start_demo()
 
1
  import os
2
+ import argparse
 
3
  from typing import Literal
4
 
5
+ import gradio as gr # type: ignore
6
+ from sotopia_space.chat import chat_introduction, chat_tab, get_sotopia_profiles
7
+ from sotopia_space import benchmark
8
+ from ui_constants import CITATION_TEXT, BANNER
9
 
 
 
 
10
 
11
  OPENAI_KEY_FILE="./openai_api.key"
12
  if os.path.exists(OPENAI_KEY_FILE):
13
  with open(OPENAI_KEY_FILE, "r") as f:
14
  os.environ["OPENAI_API_KEY"] = f.read().strip()
15
 
16
+ with open("./sotopia_space/_header.md", "r") as f:
17
+ HEADER_MD = f.read()
 
 
 
18
 
19
+ def navigation_bar():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  with gr.Column(scale=2):
21
+ toggle_dark = gr.Button(value="Toggle Dark")
22
+ toggle_dark.click(
23
+ None,
24
+ js="""
25
+ () => {
26
+ if (document.body.classList.contains('dark')) {
27
+ document.body.classList.remove('dark');
28
+ document.querySelector('gradio-app').style.backgroundColor = 'var(--color-background-primary-light)';
29
+ } else {
30
+ document.body.classList.add('dark');
31
+ document.querySelector('gradio-app').style.backgroundColor = 'var(--color-background-primary-dark)';
32
+ }
33
+ }
34
+ """,
35
+ )
36
+
37
+ with gr.Blocks(
38
+ css="""#chat_container {height: 820px; width: 1000px; margin-left: auto; margin-right: auto;}
39
+ #chatbot {height: 600px; overflow: auto;}
40
+ #create_container {height: 750px; margin-left: 0px; margin-right: 0px;}
41
+ #tokenizer_renderer span {white-space: pre-wrap}
42
+ """,
43
+ theme="gradio/monochrome",
44
+ ) as demo:
45
+ # with gr.Row():
46
+ # navigation_bar()
47
+ gr.Image(
48
+ "images/banner.png", elem_id="banner-image", show_label=False
49
  )
50
+ gr.Markdown(HEADER_MD, elem_classes="markdown-text")
51
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
52
+ with gr.TabItem("🏅 Leaderboard", elem_id="benchmark-tab-table", id=0):
53
+ benchmark.benchmark_table()
54
+ with gr.TabItem("💬 Chat", elem_id="chat-tab-interface", id=1):
55
+ with gr.Row():
56
+ chat_introduction()
57
+ with gr.Row():
58
+ chat_tab()
59
+ with gr.Row():
60
+ with gr.Accordion("📙 Citation", open=False, elem_classes="accordion-label"):
61
+ gr.Textbox(
62
+ value=CITATION_TEXT,
63
+ lines=7,
64
+ label="Copy the BibTeX snippet to cite this source",
65
+ elem_id="citation-button",
66
+ show_copy_button=True)
67
+
68
+ # def start_demo():
69
+ # demo = main()
70
+ # if DEPLOYED:
71
+ # demo.queue(api_open=False).launch(show_api=False)
72
+ # else:
73
+ # demo.queue()
74
+ # demo.launch(share=False, server_name="0.0.0.0")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
 
77
  if __name__ == "__main__":
78
+ parser = argparse.ArgumentParser()
79
+ parser.add_argument("--result_file", help="Path to results table", default="data_dir/models_vs_gpt35.jsonl")
80
+ #benchmark.original_df = pd.read_json(args.result_file, lines=True)
81
  get_sotopia_profiles()
82
  # prepare_model(DEFAULT_MODEL_SELECTION)
83
+ demo.launch()
data_dir/models_vs_gpt35.jsonl ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {"model_name": "GPT-4", "SOC [-10, 0]": -0.07, "SEC [-10, 0]": -0.14, "FIN [-5, 5]": 0.81, "REL [-5, 5]": 1.94, "KNO [0, 10]": 3.73, "GOAL [0, 10]": 7.62, "BEL [0, 10]": 9.28}
2
+ {"model_name": "GPT-3.5", "SOC [-10, 0]": -0.08, "SEC [-10, 0]": -0.08, "FIN [-5, 5]": 0.46, "REL [-5, 5]": 1.23, "KNO [0, 10]": 3.4, "GOAL [0, 10]": 6.45, "BEL [0, 10]": 9.15}
3
+ {"model_name": "Llama-2", "SOC [-10, 0]": -0.11, "SEC [-10, 0]": -0.14, "FIN [-5, 5]": 0.4, "REL [-5, 5]": 0.91, "KNO [0, 10]": 3.11, "GOAL [0, 10]": 5.38, "BEL [0, 10]": 8.1}
4
+ {"model_name": "MPT", "SOC [-10, 0]": -0.09, "SEC [-10, 0]": -0.07, "FIN [-5, 5]": 0.28, "REL [-5, 5]": 0.58, "KNO [0, 10]": 2.11, "GOAL [0, 10]": 4.1, "BEL [0, 10]": 6.17}
requirements.txt CHANGED
@@ -8,7 +8,7 @@ annotated-types==0.6.0
8
  anyio==3.7.1
9
  attrs==23.2.0
10
  beartype==0.14.1
11
- bitsandbytes==0.43.1
12
  certifi==2024.2.2
13
  cffi==1.16.0
14
  charset-normalizer==3.3.2
@@ -68,18 +68,18 @@ mypy-extensions==1.0.0
68
  names==0.3.0
69
  networkx==3.3
70
  numpy==1.26.4
71
- nvidia-cublas-cu12==12.1.3.1
72
- nvidia-cuda-cupti-cu12==12.1.105
73
- nvidia-cuda-nvrtc-cu12==12.1.105
74
- nvidia-cuda-runtime-cu12==12.1.105
75
- nvidia-cudnn-cu12==8.9.2.26
76
- nvidia-cufft-cu12==11.0.2.54
77
- nvidia-curand-cu12==10.3.2.106
78
- nvidia-cusolver-cu12==11.4.5.107
79
- nvidia-cusparse-cu12==12.1.0.106
80
- nvidia-nccl-cu12==2.19.3
81
- nvidia-nvjitlink-cu12==12.4.127
82
- nvidia-nvtx-cu12==12.1.105
83
  openai==1.22.0
84
  orjson==3.10.1
85
  packaging==23.2
@@ -129,7 +129,7 @@ toolz==0.12.1
129
  torch==2.2.2
130
  tqdm==4.66.2
131
  transformers==4.40.0
132
- triton==2.2.0
133
  typer==0.12.3
134
  types-cffi==1.16.0.20240331
135
  types-pyOpenSSL==24.0.0.20240417
 
8
  anyio==3.7.1
9
  attrs==23.2.0
10
  beartype==0.14.1
11
+ bitsandbytes==0.42.0
12
  certifi==2024.2.2
13
  cffi==1.16.0
14
  charset-normalizer==3.3.2
 
68
  names==0.3.0
69
  networkx==3.3
70
  numpy==1.26.4
71
+ # nvidia-cublas-cu12==12.1.3.1
72
+ # nvidia-cuda-cupti-cu12==12.1.105
73
+ # nvidia-cuda-nvrtc-cu12==12.1.105
74
+ # nvidia-cuda-runtime-cu12==12.1.105
75
+ # nvidia-cudnn-cu12==8.9.2.26
76
+ # nvidia-cufft-cu12==11.0.2.54
77
+ # nvidia-curand-cu12==10.3.2.106
78
+ # nvidia-cusolver-cu12==11.4.5.107
79
+ # nvidia-cusparse-cu12==12.1.0.106
80
+ # nvidia-nccl-cu12==2.19.3
81
+ # nvidia-nvjitlink-cu12==12.4.127
82
+ # nvidia-nvtx-cu12==12.1.105
83
  openai==1.22.0
84
  orjson==3.10.1
85
  packaging==23.2
 
129
  torch==2.2.2
130
  tqdm==4.66.2
131
  transformers==4.40.0
132
+ # triton==2.2.0
133
  typer==0.12.3
134
  types-cffi==1.16.0.20240331
135
  types-pyOpenSSL==24.0.0.20240417
sotopia_space/_header.md ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ <br/>
2
+
3
+ # Sotopia Space: A Huggingface Space for the Sotopia projects
4
+ [⚙️ GitHub](https://github.com/sotopia-lab) | [🤗 HuggingFace](https://huggingface.co/collections/cmu-lti/sotopia-65f312c1bd04a8c4a9225e5b) | [💬 Discussions](https://github.com/orgs/sotopia-lab/discussions)
sotopia_space/benchmark.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr # type: ignore
2
+ import pandas as pd
3
+ from sotopia_space.constants import MODEL_OPTIONS
4
+ from sotopia_space.utils import estimated_win_rate, make_clickable_model, styled_error, styled_warning, styled_message,apply_length_penalty
5
+
6
+ LP_MODE = "v2"
7
+ original_df, ablation_df = None, None
8
+ LP_original_dfs = {}
9
+ DEFAULT_LP = 0.5
10
+
11
+ available_models = [] # to be filled in later
12
+ original_df, ablation_df = None, None
13
+
14
+ def slider_change_main(length_penalty):
15
+ global original_df, ablation_df, LP_MODE
16
+ adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
17
+ adjusted_df = adjusted_df[["Model", "Overall Elo", "Task-Avg Elo", "# battles", "Length"]]
18
+ adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False)
19
+ # adjusted_df = add_winrates(adjusted_df, LP=length_penalty)
20
+ # adjusted_df = adjusted_df.drop(columns=["Length"])
21
+ adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
22
+ return adjusted_df
23
+
24
+ def slider_change_full(length_penalty, show_winrate):
25
+ global original_df, ablation_df, LP_MODE
26
+ adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
27
+ # sort the models by the "Overall Elo" column
28
+ adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False)
29
+ adjusted_df.drop(columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"], inplace=True)
30
+ if show_winrate == "none":
31
+ adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
32
+ return adjusted_df
33
+ elif show_winrate == "gpt-3.5":
34
+ adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-3.5", LP=length_penalty)
35
+ elif show_winrate == "gpt-4":
36
+ adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-4", LP=length_penalty)
37
+ adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
38
+ return adjusted_df
39
+
40
+ def benchmark_table():
41
+ global original_df, ablation_df
42
+ global LP_original_dfs, LP_MODE
43
+
44
+ gr.Markdown(f"**Version**: sotopia (v1.01; 2024.04.22) | **# Examples**: 7200 | **# Models**: {len(MODEL_OPTIONS)} | **# Comparisons**: x", elem_classes="markdown-text")
45
+
46
+ with gr.TabItem("Vs GPT-3.5", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
47
+ # original_df, ablation_df = skip_empty_original_df, skip_empty_ablation_df
48
+ original_df = pd.read_json('data_dir/models_vs_gpt35.jsonl', lines=True)
49
+ default_main_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
50
+ default_main_df = default_main_df.sort_values(by="GOAL [0, 10]", ascending=False)
51
+ # add a Rank column as the first column (starting from 1)
52
+ default_main_df.insert(0, "Rank", range(1, 1 + len(default_main_df)))
53
+ with gr.Row():
54
+ with gr.Column(scale=4):
55
+ gr.Markdown("**Vs GPT3.5**: The interlocutors are compared against GPT-3.5, the baseline model.")
56
+ with gr.Column(scale=1):
57
+ length_penlty_slider = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP, label="Length Penalty", elem_id="length-penalty-slider")
58
+ # checkbox_skip_empty = gr.Checkbox(label="Skip empty results", value=False, elem_id="skip-empty-checkbox", scale=2)
59
+ TYPES = ["number", "markdown", "number"]
60
+ leaderboard_table = gr.components.Dataframe(
61
+ value=default_main_df,
62
+ datatype=TYPES,
63
+ # max_rows=None,
64
+ height=1000,
65
+ elem_id="leaderboard-table",
66
+ interactive=False,
67
+ visible=True,
68
+ min_width=60,
69
+ )
70
+ #length_penlty_slider.change(fn=slider_change_main, inputs=[length_penlty_slider], outputs=[leaderboard_table])
sotopia_space/chat.py ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr # type: ignore
3
+ # Functions for creating the chat interface
4
+ from functools import cache
5
+ from typing import Literal
6
+ import json
7
+ from collections import defaultdict
8
+ from utils import Environment, Agent, get_context_prompt, dialogue_history_prompt
9
+ from sotopia_pi_generate import prepare_model, generate_action
10
+ from sotopia_space.constants import MODEL_OPTIONS
11
+
12
+ DEPLOYED = os.getenv("DEPLOYED", "true").lower() == "true"
13
+ DEFAULT_MODEL_SELECTION = "gpt-3.5-turbo"
14
+ TEMPERATURE = 0.7
15
+ TOP_P = 1
16
+ MAX_TOKENS = 1024
17
+
18
+ ENVIRONMENT_PROFILES = "profiles/environment_profiles.jsonl"
19
+ AGENT_PROFILES = "profiles/agent_profiles.jsonl"
20
+ RELATIONSHIP_PROFILES = "profiles/relationship_profiles.jsonl"
21
+
22
+ Action = Literal['none', 'action', 'non-verbal communication', 'speak', 'leave']
23
+ ACTION_TYPES: list[Action] = ['none', 'action', 'non-verbal communication', 'speak', 'leave']
24
+
25
+
26
+
27
+ @cache
28
+ def get_sotopia_profiles(env_file=ENVIRONMENT_PROFILES, agent_file=AGENT_PROFILES, relationship_file=RELATIONSHIP_PROFILES):
29
+ with open(env_file, 'r') as f:
30
+ data = [json.loads(line) for line in f.readlines()]
31
+
32
+ code_names_count = defaultdict(int)
33
+ environments = []
34
+ environment_dict = {}
35
+ for profile in sorted(data, key=lambda x: x['codename']):
36
+ env_obj = Environment(profile)
37
+ if profile['codename'] in code_names_count:
38
+ environments.append((
39
+ "{}_{:05d}".format(profile['codename'],
40
+ code_names_count[profile['codename']]
41
+ ),
42
+ env_obj._id
43
+ ))
44
+ else:
45
+ environments.append((profile['codename'], env_obj._id))
46
+ environment_dict[env_obj._id] = env_obj
47
+ code_names_count[profile['codename']] += 1
48
+
49
+ with open(agent_file, 'r') as f:
50
+ data = [json.loads(line) for line in f.readlines()]
51
+
52
+ agent_dict = {}
53
+ for profile in data:
54
+ agent_obj = Agent(profile)
55
+ agent_dict[agent_obj._id] = agent_obj
56
+
57
+ with open(relationship_file, 'r') as f:
58
+ data = [json.loads(line) for line in f.readlines()]
59
+
60
+ relationship_dict = defaultdict(lambda : defaultdict(list))
61
+ for profile in data:
62
+ relationship_dict[profile['relationship']][profile['agent1_id']].append(profile['agent2_id'])
63
+ relationship_dict[profile['relationship']][profile['agent2_id']].append(profile['agent1_id'])
64
+
65
+ return environments, environment_dict, agent_dict, relationship_dict
66
+
67
+ def chat_introduction():
68
+ with gr.Column(scale=2):
69
+ gr.Image(
70
+ "images/sotopia.jpg", elem_id="banner-image", show_label=False
71
+ )
72
+ with gr.Column(scale=5):
73
+ gr.Markdown(
74
+ """# Sotopia Space
75
+ **Chat with different social agent models including [sotopia-pi](https://github.com/sotopia-lab/sotopia-pi), GPT and so on in sotopia space!**
76
+
77
+ ➡️️ **Intended Use**: Sotopia space is intended to showcase the social intelligence ability of different social agents in interesting social scenarios.
78
+
79
+ ✨ **Guidance**:
80
+
81
+ Step (1) Select a social scenario that interests you in "Scenario Selection"
82
+
83
+ Step (2) Select a social agent you want to chat with in "Model Selection"
84
+
85
+ Step (3) Select which character you and your social agent will play in the scenario in "User Agent Selection" and "Bot Agent Selection"
86
+
87
+ Step (4) Negotiate/debate/cooperate with the social agent to see whether your goal or their social goal can be achieved.
88
+
89
+ ⚠️ **Limitations**: The social agent can and will produce factually incorrect information, hallucinating facts and potentially offensive actions. It can produce problematic outputs, especially if prompted to do so.
90
+
91
+ 🗄️ **Disclaimer**: User prompts and generated replies from the model may be collected solely for the purpose of pure academic research. By using this demo, users implicitly agree to these terms.
92
+ """
93
+ )
94
+ # with gr.Column(scale=1):
95
+ # toggle_dark = gr.Button(value="Toggle Dark")
96
+
97
+ def create_user_agent_dropdown(environment_id):
98
+ _, environment_dict, agent_dict, relationship_dict = get_sotopia_profiles()
99
+ environment = environment_dict[environment_id]
100
+
101
+ user_agents_list = []
102
+ unique_agent_ids = set()
103
+ for x, _ in relationship_dict[environment.relationship].items():
104
+ unique_agent_ids.add(x)
105
+
106
+ for agent_id in unique_agent_ids:
107
+ user_agents_list.append((agent_dict[agent_id].name, agent_id))
108
+ return gr.Dropdown(choices=user_agents_list, value=user_agents_list[0][1] if user_agents_list else None, label="User Agent Selection")
109
+
110
+ def create_bot_agent_dropdown(environment_id, user_agent_id):
111
+ _, environment_dict, agent_dict, relationship_dict = get_sotopia_profiles()
112
+ environment, user_agent = environment_dict[environment_id], agent_dict[user_agent_id]
113
+
114
+ bot_agent_list = []
115
+ for neighbor_id in relationship_dict[environment.relationship][user_agent.agent_id]:
116
+ bot_agent_list.append((agent_dict[neighbor_id].name, neighbor_id))
117
+
118
+ return gr.Dropdown(choices=bot_agent_list, value=bot_agent_list[0][1] if bot_agent_list else None, label="Bot Agent Selection")
119
+
120
+ def create_environment_info(environment_dropdown):
121
+ _, environment_dict, _, _ = get_sotopia_profiles()
122
+ environment = environment_dict[environment_dropdown]
123
+ text = environment.scenario
124
+ return gr.Textbox(label="Scenario", lines=1, value=text)
125
+
126
+ def create_user_info(user_agent_dropdown):
127
+ _, _, agent_dict, _ = get_sotopia_profiles()
128
+ user_agent = agent_dict[user_agent_dropdown]
129
+ text = f"{user_agent.background} {user_agent.personality}"
130
+ return gr.Textbox(label="User Agent Profile", lines=4, value=text)
131
+
132
+ def create_bot_info(bot_agent_dropdown):
133
+ _, _, agent_dict, _ = get_sotopia_profiles()
134
+ bot_agent = agent_dict[bot_agent_dropdown]
135
+ text = f"{bot_agent.background} {bot_agent.personality}"
136
+ return gr.Textbox(label="Bot Agent Profile", lines=4, value=text)
137
+
138
def _format_goal_text(raw_goal):
    """Turn a raw goal string into display text.

    Parentheses are removed. For each of the ``extra_info`` and
    ``strategy_hint`` tags, if the opening tag is present it is replaced
    by a blank line and the closing tag by a newline. A closing tag
    without its opening tag is left untouched, as before.
    """
    text = raw_goal.replace('(', '').replace(')', '')
    for tag in ("extra_info", "strategy_hint"):
        if f"<{tag}>" in text:
            text = text.replace(f"<{tag}>", "\n\n")
            text = text.replace(f"</{tag}>", "\n")
    return text


def create_user_goal(environment_dropdown):
    """Return the "User Agent Goal" textbox for the selected environment.

    The user's goal is the first entry of the environment's ``agent_goals``.
    """
    _, environment_dict, _, _ = get_sotopia_profiles()
    raw_goal = environment_dict[environment_dropdown].agent_goals[0]
    return gr.Textbox(label="User Agent Goal", lines=4, value=_format_goal_text(raw_goal))


def create_bot_goal(environment_dropdown):
    """Return the "Bot Agent Goal" textbox for the selected environment.

    The bot's goal is the second entry of the environment's ``agent_goals``.
    """
    _, environment_dict, _, _ = get_sotopia_profiles()
    raw_goal = environment_dict[environment_dropdown].agent_goals[1]
    return gr.Textbox(label="Bot Agent Goal", lines=4, value=_format_goal_text(raw_goal))
161
+
162
def sotopia_info_accordion(accordion_visible=True):
    """Render the scenario/agent pickers and the read-only task summary.

    Two accordions are created, both opened according to
    ``accordion_visible``: one with the scenario, model, user-agent and
    bot-agent dropdowns, and one with the scenario text, both goals and
    both agent profiles. Change handlers keep the dependent widgets in
    sync with the dropdowns.

    Returns the tuple ``(model_name_dropdown, environment_dropdown,
    user_agent_dropdown, bot_agent_dropdown)``.
    """
    # NOTE(review): the nesting of the layout contexts below was inferred
    # from statement order; confirm against the rendered UI.
    environments, _, _, _ = get_sotopia_profiles()
    first_environment = environments[0][1] if environments else None

    with gr.Accordion("Create your sotopia space!", open=accordion_visible):
        with gr.Row():
            environment_dropdown = gr.Dropdown(
                choices=environments,
                label="Scenario Selection",
                value=first_environment,
                interactive=True,
            )
            model_name_dropdown = gr.Dropdown(
                choices=MODEL_OPTIONS,
                value=DEFAULT_MODEL_SELECTION,
                interactive=True,
                label="Model Selection",
            )

        with gr.Row():
            user_agent_dropdown = create_user_agent_dropdown(environment_dropdown.value)
            bot_agent_dropdown = create_bot_agent_dropdown(
                environment_dropdown.value, user_agent_dropdown.value
            )

    with gr.Accordion("Check your social task!", open=accordion_visible):
        scenario_info_display = create_environment_info(environment_dropdown.value)

        with gr.Row():
            bot_goal_display = create_bot_goal(environment_dropdown.value)
            user_goal_display = create_user_goal(environment_dropdown.value)

        with gr.Row():
            bot_agent_info_display = create_bot_info(bot_agent_dropdown.value)
            user_agent_info_display = create_user_info(user_agent_dropdown.value)

    # (trigger, handler, handler inputs, widget to refresh), registered in
    # this exact order.
    change_handlers = [
        # scenario -> user agent choices
        (environment_dropdown, create_user_agent_dropdown,
         [environment_dropdown], user_agent_dropdown),
        # user agent (with the current scenario) -> bot agent choices
        (user_agent_dropdown, create_bot_agent_dropdown,
         [environment_dropdown, user_agent_dropdown], bot_agent_dropdown),
        # scenario -> scenario text
        (environment_dropdown, create_environment_info,
         [environment_dropdown], scenario_info_display),
        # user agent -> user profile
        (user_agent_dropdown, create_user_info,
         [user_agent_dropdown], user_agent_info_display),
        # bot agent -> bot profile
        (bot_agent_dropdown, create_bot_info,
         [bot_agent_dropdown], bot_agent_info_display),
        # scenario -> user goal
        (environment_dropdown, create_user_goal,
         [environment_dropdown], user_goal_display),
        # scenario -> bot goal
        (environment_dropdown, create_bot_goal,
         [environment_dropdown], bot_goal_display),
    ]
    for trigger, handler, handler_inputs, target in change_handlers:
        trigger.change(fn=handler, inputs=handler_inputs, outputs=[target])

    return model_name_dropdown, environment_dropdown, user_agent_dropdown, bot_agent_dropdown
214
+
215
def instructions_accordion(instructions, according_visible=False):
    """Create a closed "Instructions" accordion wrapping a read-only textbox.

    ``according_visible`` controls the accordion's visibility; the textbox
    itself is always created with ``visible=False``. Returns the textbox.
    """
    with gr.Accordion("Instructions", open=False, visible=according_visible):
        instructions_box = gr.Textbox(
            lines=10,
            value=instructions,
            interactive=False,
            placeholder="Instructions",
            show_label=False,
            max_lines=10,
            visible=False,
        )
    return instructions_box
227
+
228
def chat_tab():
    """Lay out the chat tab: the profile pickers followed by the chat interface.

    The chat callback looks up the selected environment and agents, builds
    a prompt from the context and the dialogue so far, asks
    ``generate_action`` for the bot's next action and returns it as
    natural language.
    """
    # NOTE(review): the nesting of the layout contexts below was inferred
    # from statement order; confirm against the rendered UI.
    _, environment_dict, agent_dict, _ = get_sotopia_profiles()

    def run_chat(
        message,
        history,
        environment_selection,
        user_agent_dropdown,
        bot_agent_dropdown,
        model_selection: str,
    ):
        # `history` holds the previous (input, output) pairs.
        environment = environment_dict[environment_selection]
        user_agent = agent_dict[user_agent_dropdown]
        bot_agent = agent_dict[bot_agent_dropdown]

        context = get_context_prompt(bot_agent, user_agent, environment)
        dialogue_history, next_turn_idx = dialogue_history_prompt(
            message, history, user_agent, bot_agent
        )
        prompt_history = f"{context}{dialogue_history}"
        agent_action = generate_action(
            model_selection,
            prompt_history,
            next_turn_idx,
            ACTION_TYPES,
            bot_agent.name,
            TEMPERATURE,
        )
        return agent_action.to_natural_language()

    with gr.Column():
        with gr.Blocks():
            (
                model_name_dropdown,
                scenario_dropdown,
                user_agent_dropdown,
                bot_agent_dropdown,
            ) = sotopia_info_accordion()

            with gr.Column():
                with gr.Accordion("Start the conversation to achieve your goal!", open=True):
                    # Both widgets are created with render=False and handed
                    # to ChatInterface, which places them itself.
                    chat_window = gr.Chatbot(
                        height=620,
                        render=False,
                        show_label=False,
                        rtl=False,
                        avatar_images=(
                            "images/profile1.jpg",
                            "images/profile2.jpg",
                        ),
                    )
                    message_box = gr.Textbox(
                        placeholder="Write your message here...",
                        render=False,
                        scale=7,
                        rtl=False,
                    )
                    gr.ChatInterface(
                        fn=run_chat,
                        chatbot=chat_window,
                        textbox=message_box,
                        # Passed to run_chat after (message, history), in this order.
                        additional_inputs=[
                            scenario_dropdown,
                            user_agent_dropdown,
                            bot_agent_dropdown,
                            model_name_dropdown,
                        ],
                        submit_btn="Send",
                        stop_btn="Stop",
                        retry_btn="🔄 Retry",
                        undo_btn="↩️ Delete",
                        clear_btn="🗑️ Clear",
                    )
sotopia_space/constants.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Model identifiers offered in the "Model Selection" dropdown.
# Every entry ends with a comma so that re-enabling a commented-out entry
# cannot silently merge two adjacent string literals.
MODEL_OPTIONS = [
    "gpt-3.5-turbo",
    "gpt-4",
    "gpt-4-turbo",
    "cmu-lti/sotopia-pi-mistral-7b-BC_SR",
    "cmu-lti/sotopia-pi-mistral-7b-BC_SR_4bit",
    "mistralai/Mistral-7B-Instruct-v0.1",
    # "mistralai/Mixtral-8x7B-Instruct-v0.1",
    # "togethercomputer/llama-2-7b-chat",
    # "togethercomputer/llama-2-70b-chat",
    # "togethercomputer/mpt-30b-chat",
    # "together_ai/togethercomputer/llama-2-7b-chat",
    # "together_ai/togethercomputer/falcon-7b-instruct",
]
15
+
16
# Leaderboard display metadata, keyed by the model name used in result tables.
#   pretty_name: label shown in the table.
#   hf_model_id: either a Hugging Face repo id ("org/name") or, for models
#                without a Hub repo, a full http(s) URL to link to.
# The Yi, Gemma and Tulu ids point at the variant/organisation named by
# the entry itself (chat / instruction-tuned checkpoints, allenai org).
MODEL_INFO = {
    "Llama-2-13b-chat-hf.nosp": {"pretty_name": "Llama-2-13B-chat", "hf_model_id": "meta-llama/Llama-2-13b-chat-hf"},
    "Llama-2-70b-chat-hf.nosp": {"pretty_name": "Llama-2-70B-chat", "hf_model_id": "meta-llama/Llama-2-70b-chat-hf"},
    "Llama-2-7b-chat-hf.nosp": {"pretty_name": "Llama-2-7B-chat", "hf_model_id": "meta-llama/Llama-2-7b-chat-hf"},
    "Llama-2-7b-chat-hf": {"pretty_name": "Llama-2-7B-chat (+sys prmpt)", "hf_model_id": "meta-llama/Llama-2-7b-chat-hf"},
    "Mistral-7B-Instruct-v0.1": {"pretty_name": "Mistral-7B-Instruct", "hf_model_id": "mistralai/Mistral-7B-Instruct-v0.1"},
    "Mistral-7B-Instruct-v0.2": {"pretty_name": "Mistral-7B-Instruct (v0.2)", "hf_model_id": "mistralai/Mistral-7B-Instruct-v0.2"},
    "Mixtral-8x7B-Instruct-v0.1": {"pretty_name": "Mixtral-8x7B-Instruct", "hf_model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1"},
    "Nous-Hermes-2-Mixtral-8x7B-DPO": {"pretty_name": "Nous-Hermes-2-Mixtral-8x7B-DPO", "hf_model_id": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"},
    "Yi-34B-Chat": {"pretty_name": "Yi-34B-Chat", "hf_model_id": "01-ai/Yi-34B-Chat"},
    "gemini-1.0-pro": {"pretty_name": "gemini-1.0-pro", "hf_model_id": "https://blog.google/technology/ai/google-gemini-ai/"},
    "gemma-7b-it": {"pretty_name": "Gemma-7B-it", "hf_model_id": "google/gemma-7b-it"},
    "gpt-3.5-turbo-0125": {"pretty_name": "gpt-3.5-turbo-0125", "hf_model_id": "https://platform.openai.com/"},
    "gpt-4-0125-preview": {"pretty_name": "gpt-4-0125-preview", "hf_model_id": "https://platform.openai.com/"},
    "tulu-2-dpo-70b": {"pretty_name": "Tulu-2-dpo-70b", "hf_model_id": "allenai/tulu-2-dpo-70b"},
    "vicuna-13b-v1.5": {"pretty_name": "Vicuna-13b-v1.5", "hf_model_id": "lmsys/vicuna-13b-v1.5"},
    "zephyr-7b-beta": {"pretty_name": "Zephyr-7b-beta", "hf_model_id": "HuggingFaceH4/zephyr-7b-beta"},
    "mistral-large-2402": {"pretty_name": "Mistral-Large", "hf_model_id": "https://mistral.ai/news/mistral-large/"},
    "claude-3-opus-20240229": {"pretty_name": "Claude 3 Opus", "hf_model_id": "https://www.anthropic.com/claude"},
    "claude-3-sonnet-20240229": {"pretty_name": "Claude 3 Sonnet", "hf_model_id": "https://www.anthropic.com/claude"},
    "zephyr-7b-gemma-v0.1": {"pretty_name": "Zephyr-7b-Gemma", "hf_model_id": "HuggingFaceH4/zephyr-7b-gemma-v0.1"},
    "Starling-LM-7B-beta": {"pretty_name": "StarlingLM-7B-beta", "hf_model_id": "Nexusflow/Starling-LM-7B-beta"},
    "dbrx-instruct": {"pretty_name": "DBRX Instruct", "hf_model_id": "databricks/dbrx-instruct"},
}
sotopia_space/utils.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset, Dataset
2
+ import os
3
+ import json
4
+ from datasets import load_dataset
5
+ from datasets.utils.logging import disable_progress_bar # type: ignore
6
+ from ui_constants import column_names, all_task_types
7
+ import random
8
+ disable_progress_bar()
9
+ import math
10
+ from sotopia_space.constants import MODEL_INFO
11
+
12
+ id_to_data = None
13
+ model_len_info = None
14
+
15
+
16
def make_clickable_model(model_name):
    """Return an HTML link for a known model, or the name unchanged otherwise.

    Entries whose ``hf_model_id`` starts with "http" link straight to that
    URL and get a lock icon; all others link to the Hugging Face page for
    the id and get a fire icon. The link text is the entry's
    ``pretty_name``.
    """
    if model_name not in MODEL_INFO:
        return model_name

    entry = MODEL_INFO[model_name]
    if entry["hf_model_id"].startswith("http"):
        icon = "🔒"
        link = entry["hf_model_id"]
    else:
        icon = "🔥"
        link = f"https://huggingface.co/{entry['hf_model_id']}"
    return f'{icon} <a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{entry["pretty_name"]}</a>'
27
+
28
+
29
def styled_error(error):
    """Wrap *error* in a centered, red, 20px HTML paragraph."""
    paragraph_style = "color: red; font-size: 20px; text-align: center;"
    return f"<p style='{paragraph_style}'>{error}</p>"
31
+
32
def styled_warning(warn):
    """Wrap *warn* in a centered, orange, 20px HTML paragraph."""
    paragraph_style = "color: orange; font-size: 20px; text-align: center;"
    return f"<p style='{paragraph_style}'>{warn}</p>"
34
+
35
def styled_message(message):
    """Wrap *message* in a centered, green, 20px HTML paragraph."""
    paragraph_style = "color: green; font-size: 20px; text-align: center;"
    return f"<p style='{paragraph_style}'>{message}</p>"
37
+
38
+
39
def estimated_win_rate(elo_a, elo_b, LP=0):
    """
    Estimate, as a percentage, how often player B beats player A.

    Note the direction: the value returned is the complement of A's
    expected score, i.e. B's win rate against A. This is what the
    "Win% vs <reference>" columns need, where the reference model is
    passed as ``elo_a`` and the model being scored as ``elo_b``.

    :param elo_a: Elo rating of player A (the reference)
    :param elo_b: Elo rating of player B (the player being scored)
    :param LP: the rating gap is multiplied by ``10 ** LP`` before the
        usual 400-point logistic is applied; ``LP=0`` is the plain Elo formula
    :return: Estimated win rate of player B against player A, in [0, 100]
    """
    scaled_gap = (elo_b - elo_a) * (10 ** LP) / 400
    expected_score_a = 1 / (1 + 10 ** scaled_gap)
    return (1 - expected_score_a) * 100
49
+
50
+
51
+
52
+ # Formats the columns
53
def formatter(x):
    """Format one table cell: strings pass through, numbers are rounded to 1 decimal."""
    if isinstance(x, str):
        return x
    return round(x, 1)
59
+
60
+
61
def add_winrates(current_df, LP=0):
    """Return a copy of *current_df* with win-rate columns versus GPT-4 and GPT-3.5.

    The reference rating for each column is the "Task-Avg Elo" of the
    first row whose "Model" value contains the literal text "gpt-4" /
    "gpt-3.5". Every model's rating is then turned into its estimated win
    percentage against that reference. The "# battles" and "Length"
    columns are moved to the end.

    Raises IndexError when no row matches a reference name, and
    ValueError when "# battles" or "Length" is missing.
    """
    df = current_df.copy()
    elo_column = "Task-Avg Elo"

    # regex=False: the "." in "gpt-3.5" must match a literal dot only.
    gpt4_elo = df[df["Model"].str.contains("gpt-4", regex=False)][elo_column].iloc[0]
    gpt35_elo = df[df["Model"].str.contains("gpt-3.5", regex=False)][elo_column].iloc[0]

    # Win rate of each model against the reference, rounded for display.
    df['Win% vs GPT-4'] = df[elo_column].apply(lambda x: estimated_win_rate(gpt4_elo, x, LP=LP)).apply(formatter)
    df['Win% vs GPT-3.5T'] = df[elo_column].apply(lambda x: estimated_win_rate(gpt35_elo, x, LP=LP)).apply(formatter)

    cols = list(df.columns)
    for trailing_column in ("# battles", "Length"):
        cols.remove(trailing_column)
        cols.append(trailing_column)
    return df[cols]
81
+
82
def add_winrates_tasks(current_df, ref="gpt-4", LP=0):
    """Return a copy of *current_df* with each task column turned into a win rate.

    For every task type, the rating of the first row whose "Model" value
    contains ``ref`` is the reference, and each rating in that task's
    column is replaced by the estimated win percentage against it.
    """
    result = current_df.copy()
    for task_type in all_task_types:
        task_column = column_names[task_type]
        reference_rows = current_df[current_df["Model"].str.contains(ref)]
        reference_elo = reference_rows[task_column].iloc[0]

        def win_rate_vs_reference(elo, reference_elo=reference_elo):
            return estimated_win_rate(reference_elo, elo, LP=LP)

        result[task_column] = current_df[task_column].apply(win_rate_vs_reference).apply(formatter)
    return result
89
+
90
+
91
def post_processing(df, model_len_info):
    """Prepare a raw results frame for display.

    Steps, in order: optionally add a "Length" column from
    ``model_len_info[name]["avg_len"]``; turn model names into clickable
    links and run every other column through ``formatter``; rename
    columns via ``column_names``; sort by "Task-Avg Elo" descending; move
    "Model" and "Task-Avg Elo" to the front.

    Note that *df* itself is modified in place by the first four steps;
    the returned frame is the reordered selection.
    """
    if model_len_info:
        df["Length"] = df["model name "].apply(lambda x: model_len_info[x]["avg_len"])

    for col in df.columns:
        if col == "model name ":
            df[col] = df[col].apply(make_clickable_model)
        else:
            df[col] = df[col].apply(formatter)  # strings pass through, numbers are rounded
    df.rename(columns=column_names, inplace=True)
    df.sort_values(by="Task-Avg Elo", inplace=True, ascending=False)
    # Put "Model" and "Task-Avg Elo" first, keeping the other columns in order.
    leading = ["Model", "Task-Avg Elo"]
    df = df[leading + [col for col in df.columns if col not in leading]]
    return df
106
+
107
def apply_length_penalty(original_df, ablation_df, length_penalty=0.2, mode='v1', LP_original_dfs=None):
    """Return *original_df* unchanged; the length penalty is temporarily disabled.

    All other arguments are accepted for compatibility and ignored.

    The disabled implementation behaved as follows:

    * ``mode == 'v2'`` with ``LP_original_dfs`` given: return
      ``LP_original_dfs[f"{length_penalty:.1f}"]``.
    * otherwise, on copies of both frames, for every row and every column
      except "Model", "# battles" and "Length", with ``y`` taken from the
      ``ablation_df`` row that has the same "Model" (row order was not
      assumed to match):
        - ``mode == 'v1'``:   ``x - y * length_penalty``
        - ``mode == 'v1.1'``: ``x * (1 - length_penalty) + (x - y) * length_penalty``
      followed by ``post_processing(original_df, model_len_info=None)``.
    """
    return original_df
134
+
135
def load_benchdata():
    """Load and return the "test" split of the "cmu-lti/sotopia" dataset."""
    print("Loading sotopia data...")
    return load_dataset("cmu-lti/sotopia", split="test")
139
+
140
def load_benchdata_dict():
    """Load the sotopia episodes file and index its rows by "session_id".

    Later rows replace earlier ones that share a session id.
    """
    print("Loading sotopia data....")
    episodes = load_dataset("cmu-lti/sotopia", data_files="sotopia_episodes_v1_hf.jsonl")['train']
    return {episode["session_id"]: episode for episode in episodes}
147
+
148
def load_eval_results():
    """Load and return the "train" split of the "all" config of "WildEval/sotopia-Evaluation"."""
    print("Loading sotopia Evaluation data...")
    return load_dataset("WildEval/sotopia-Evaluation", "all", split="train")
152
+
153
def load_infer_results(model_name):
    """Load and return the "train" split of the *model_name* config of "WildEval/sotopia-Results"."""
    print(f"Loading sotopia Results for {model_name}...")
    return load_dataset("WildEval/sotopia-Results", model_name, split="train")
157
+
158
def sample_an_eval_result(eval_results, model_list=None, tag_list=None):
    """Pick a random pairwise evaluation that satisfies the given filters.

    :param eval_results: iterable of evaluation records (mappings)
    :param model_list: optional model filter. With two or more names, both
        compared models must be in the list; with exactly one name, at
        least one of the two models must be that name
    :param tag_list: optional tag filter; a record must share at least one
        tag with it
    :return: a dict describing the sampled comparison (models, outputs,
        winner, judgement, and the matching session's intent, checklist
        and conversation input), or ``None`` when no record qualifies

    Records in which either model's output is blank are skipped. The
    session lookup table ``id_to_data`` is loaded on first use if it has
    not been populated yet.
    """
    global id_to_data
    model_list = model_list or []
    tag_list = tag_list or []

    candidates = list(eval_results)
    random.shuffle(candidates)
    for eval_item in candidates:
        assignment = eval_item['assignment']
        model_1, model_2 = eval_item['model_1'], eval_item['model_2']
        # Map the stored model_1/model_2 onto the A/B slots shown to the judge.
        model_A = model_1 if assignment['A'] == model_1 else model_2
        model_B = model_2 if assignment['B'] == model_2 else model_1

        if len(model_list) >= 2:
            if model_A not in model_list or model_B not in model_list:
                continue
        elif len(model_list) == 1:
            if model_A != model_list[0] and model_B != model_list[0]:
                continue
        if tag_list and set(tag_list).isdisjoint(eval_item['tags']):
            continue

        winner = eval_item['winner']
        task_type = eval_item['tags'][0]  # primary task type
        chat_history = eval_item['history']
        last_query = eval_item['last_query']

        model_A_output = eval_item['model_1_output'] if model_1 == model_A else eval_item['model_2_output']
        model_B_output = eval_item['model_2_output'] if model_2 == model_B else eval_item['model_1_output']
        if not model_A_output.strip() or not model_B_output.strip():
            continue

        # The table starts out as None; load it lazily so that a qualifying
        # record does not fail on a None lookup.
        if id_to_data is None:
            id_to_data = load_benchdata_dict()
        session = id_to_data[eval_item['session_id']]

        return {
            "session_id": eval_item['session_id'],
            "model_A": model_A,
            "model_B": model_B,
            "winner": winner,
            "intent": session["intent"],
            "task_type": task_type,
            "all_tags": eval_item['tags'],
            "chat_history": chat_history,
            "last_query": last_query,
            "conversation_input": session["conversation_input"],
            "model_A_output": model_A_output,
            "model_B_output": model_B_output,
            "reason": eval_item['parsed_result']["reason"],
            "choice": eval_item['parsed_result']["choice"],
            "checklist": session["checklist"],
        }
    # Nothing matched the filters.
    return None
222
+
223
+ #id_to_data = load_benchdata_dict()
start_app.sh CHANGED
@@ -1,4 +1,4 @@
1
  export OPENAI_API_KEY=$(cat openai_api.key)
2
  export HF_TOKEN=$(cat hf_token.key)
3
 
4
- python app.py
 
1
  export OPENAI_API_KEY=$(cat openai_api.key)
2
  export HF_TOKEN=$(cat hf_token.key)
3
 
4
+ gradio app.py
ui_constants.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ DEFAULT_LP = 0.5
4
+
5
# A GitHub "blob" URL serves an HTML page; "?raw=true" makes it resolve to
# the image file itself so it can be used as an <img> source.
banner_url = "https://github.com/sotopia-lab/sotopia-website/blob/main/public/bg_xl.png?raw=true"
BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 800px;"> </div>'
7
+
8
# Page title markup; the heading is closed with </h1> to match its opening tag.
TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 sotopia Leaderboard </h1> </body> </html>"
9
+
10
+ WINRATE_HEATMAP = "<div><img src='https://github.com/WildEval/sotopia-Leaderboard/blob/main/gradio/pairwise_win_fractions.png?raw=true' style='width:100%;'></div>"
11
+
12
+ CITATION_TEXT = """@inproceedings{
13
+ zhou2024sotopia,
14
+ title={{SOTOPIA}: Interactive Evaluation for Social Intelligence in Language Agents},
15
+ author={Xuhui Zhou and Hao Zhu and Leena Mathur and Ruohong Zhang and Haofei Yu and Zhengyang Qi and Louis-Philippe Morency and Yonatan Bisk and Daniel Fried and Graham Neubig and Maarten Sap},
16
+ booktitle={The Twelfth International Conference on Learning Representations},
17
+ year={2024},
18
+ url={https://openreview.net/forum?id=mM7VurbA4r}
19
+ }
20
+ """
21
+
22
+
23
+ column_names = {
24
+ "model name ": "Model",
25
+ "elo overall": "Overall Elo",
26
+ 'Information seeking': 'InfoSek',
27
+ 'Creative Writing': 'CrtWrt',
28
+ 'Coding & Debugging': 'Code',
29
+ 'Reasoning': 'Reason',
30
+ 'Editing': 'Edit',
31
+ 'Math': 'Math',
32
+ 'Planning': 'Plan',
33
+ 'Brainstorming': 'Brnstrm',
34
+ 'Role playing': 'RolPly',
35
+ 'Advice seeking': 'AdvSek',
36
+ 'Data Analysis': 'DataAna',
37
+ 'Others': 'Misc',
38
+ "average": "Task-Avg Elo",
39
+ }
40
+
41
+ all_task_types = [
42
+ 'Information seeking',
43
+ 'Creative Writing',
44
+ 'Coding & Debugging',
45
+ 'Reasoning',
46
+ 'Editing',
47
+ 'Math',
48
+ 'Planning',
49
+ 'Brainstorming',
50
+ 'Role playing',
51
+ 'Advice seeking',
52
+ 'Data Analysis',
53
+ 'Others'
54
+ ]
55
+
56
+
57
+
58
+ js_light = """
59
+ function refresh() {
60
+ const url = new URL(window.location);
61
+ if (url.searchParams.get('__theme') !== 'light') {
62
+ url.searchParams.set('__theme', 'light');
63
+ window.location.href = url.href;
64
+ }
65
+ }
66
+ """
67
+
68
+ js_code = """
69
+ function scroll_top() {
70
+ console.log("Hello from Gradio!");
71
+ const bubbles = document.querySelectorAll('.bubble-wrap');
72
+ bubbles.forEach((bubble, index) => {
73
+ setTimeout(() => {
74
+ bubble.scrollTop = 0;
75
+ }, index * 100); // Delay of 100ms between each iteration
76
+ });
77
+ }
78
+ """
79
+
80
+
81
+ TASK_TYPE_STR = "**Tasks**: Info seeking (**InfoSek**), Creative Writing (**CrtWrt**), Coding&Debugging (**Code**), Reasoning (**Reason**), Editing (**Edit**), **Math**, Planning (**Plan**), Brainstorming (**Brnstrm**), Role playing (**RolPly**), Advice seeking (**AdvSek**), Data Analysis (**DataAna**)"
82
+
83
+ css = """
84
+ code {
85
+ font-size: large;
86
+ }
87
+ footer {visibility: hidden}
88
+ .top-left-LP{
89
+ margin-top: 6px;
90
+ margin-left: 5px;
91
+ }
92
+ .markdown-text{font-size: 14pt}
93
+ .markdown-text-small{font-size: 13pt}
94
+ .markdown-text-tiny{font-size: 12pt}
95
+ .markdown-text-tiny-red{
96
+ font-size: 12pt;
97
+ color: red;
98
+ background-color: yellow;
99
+ font-color: red;
100
+ font-weight: bold;
101
+ }
102
+ th {
103
+ text-align: center;
104
+ font-size: 17px; /* Adjust the font size as needed */
105
+ }
106
+ td {
107
+ font-size: 15px; /* Adjust the font size as needed */
108
+ text-align: center;
109
+ }
110
+ .sample_button{
111
+ border: 1px solid #000000;
112
+ border-radius: 5px;
113
+ padding: 5px;
114
+ font-size: 15pt;
115
+ font-weight: bold;
116
+ margin: 5px;
117
+ }
118
+ .chat-common{
119
+ height: auto;
120
+ max-height: 400px;
121
+ min-height: 100px;
122
+ }
123
+ .chat-specific{
124
+ height: auto;
125
+ max-height: 600px;
126
+ min-height: 200px;
127
+ }
128
+ #od-benchmark-tab-table-button{
129
+ font-size: 15pt;
130
+ font-weight: bold;
131
+ }
132
+ .btn_boderline{
133
+ border: 1px solid #000000;
134
+ border-radius: 5px;
135
+ padding: 5px;
136
+ margin: 5px;
137
+ font-size: 15pt;
138
+ font-weight: bold;
139
+ }
140
+ .btn_boderline_next{
141
+ border: 0.1px solid #000000;
142
+ border-radius: 5px;
143
+ padding: 5px;
144
+ margin: 5px;
145
+ font-size: 15pt;
146
+ font-weight: bold;
147
+ }
148
+ .btn_boderline_gray{
149
+ border: 0.5px solid gray;
150
+ border-radius: 5px;
151
+ padding: 5px;
152
+ margin: 5px;
153
+ font-size: 15pt;
154
+ font-weight: italic;
155
+ }
156
+ .btn_boderline_selected{
157
+ border: 2px solid purple;
158
+ background-color: #f2f2f2;
159
+ border-radius: 5px;
160
+ padding: 5px;
161
+ margin: 5px;
162
+ font-size: 15pt;
163
+ font-weight: bold;
164
+ }
165
+ .accordion-label button span{
166
+ font-size: 14pt;
167
+ font-weight: bold;
168
+ }
169
+ #select-models span{
170
+ font-size: 10pt;
171
+ }
172
+ #select-tasks span{
173
+ font-size: 10pt;
174
+ }
175
+ .markdown-text-details{
176
+ margin: 10px;
177
+ padding: 10px;
178
+ }
179
+ button.selected[role="tab"][aria-selected="true"] {
180
+ font-size: 18px; /* or any other size you prefer */
181
+ font-weight: bold;
182
+ }
183
+ #od-benchmark-tab-table-ablation-button {
184
+ font-size: larger; /* Adjust the font size as needed */
185
+ }
186
+ .plotly-plot{
187
+ height: auto;
188
+ max-height: 600px;
189
+ min-height: 600px;
190
+ }
191
+ """