Yeoni Rhee committed
Commit ac011bc · 1 Parent(s): 79cc507

Sotopia Task Submission

Files changed (1)
  1. sotopia_space/benchmark.py +3 -37
sotopia_space/benchmark.py CHANGED
@@ -1,42 +1,13 @@
 import gradio as gr # type: ignore
 import pandas as pd
 from sotopia_space.constants import MODEL_OPTIONS
-from sotopia_space.utils import estimated_win_rate, make_clickable_model, styled_error, styled_warning, styled_message,apply_length_penalty
+from sotopia_space.utils import apply_length_penalty
 
 LP_MODE = "v2"
 original_df, ablation_df = None, None
 LP_original_dfs = {}
 DEFAULT_LP = 0.5
 
-available_models = [] # to be filled in later
-original_df, ablation_df = None, None
-
-def slider_change_main(length_penalty):
-    global original_df, ablation_df, LP_MODE
-    adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
-    adjusted_df = adjusted_df[["Model", "Overall Elo", "Task-Avg Elo", "# battles", "Length"]]
-    adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False)
-    # adjusted_df = add_winrates(adjusted_df, LP=length_penalty)
-    # adjusted_df = adjusted_df.drop(columns=["Length"])
-    adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
-    return adjusted_df
-
-def slider_change_full(length_penalty, show_winrate):
-    global original_df, ablation_df, LP_MODE
-    adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
-    # sort the model by the "Task-Avg Elo" column
-    adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False)
-    adjusted_df.drop(columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"], inplace=True)
-    if show_winrate == "none":
-        adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
-        return adjusted_df
-    elif show_winrate == "gpt-3.5":
-        adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-3.5", LP=length_penalty)
-    elif show_winrate == "gpt-4":
-        adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-4", LP=length_penalty)
-    adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
-    return adjusted_df
-
 def benchmark_table():
     global original_df, ablation_df
     global LP_original_dfs, LP_MODE
@@ -44,7 +15,6 @@ def benchmark_table():
     gr.Markdown(f"**Version**: sotopia (v1.01; 2024.04.22) | **# Examples**: 7200 | **# Models**: {len(MODEL_OPTIONS)} | **# Comparisons**: x", elem_classes="markdown-text")
 
     with gr.TabItem("Vs GPT-3.5", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
-        # original_df, ablation_df = skip_empty_original_df, skip_empty_ablation_df
         original_df = pd.read_json('data_dir/models_vs_gpt35.jsonl', lines=True)
         default_main_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
         default_main_df = default_main_df.sort_values(by="GOAL [0, 10]", ascending=False)
@@ -52,10 +22,7 @@ def benchmark_table():
         default_main_df.insert(0, "Rank", range(1, 1 + len(default_main_df)))
         with gr.Row():
             with gr.Column(scale=4):
-                gr.Markdown("**Vs GPT3.5**: The interlocutors are compared against GPT-3.5, the baseline model.")
-            with gr.Column(scale=1):
-                length_penlty_slider = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP, label="Length Penalty", elem_id="length-penalty-slider")
-                # checkbox_skip_empty = gr.Checkbox(label="Skip empty results", value=False, elem_id="skip-empty-checkbox", scale=2)
+                gr.Markdown("**Vs GPT3.5**: The interlocutors are compared against GPT-3.5, the baseline model.")
         TYPES = ["number", "markdown", "number"]
         leaderboard_table = gr.components.Dataframe(
             value=default_main_df,
@@ -66,5 +33,4 @@ def benchmark_table():
             interactive=False,
             visible=True,
             min_width=60,
-        )
-        #length_penlty_slider.change(fn=slider_change_main, inputs=[length_penlty_slider], outputs=[leaderboard_table])
+        )
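
For context, the deleted slider_change_main and slider_change_full helpers (together with the already-commented-out length_penlty_slider.change(...) wiring) follow Gradio's standard event pattern: a gr.Slider whose .change event recomputes and re-ranks the leaderboard DataFrame. Below is a minimal, self-contained sketch of that pattern; the toy recompute() and its made-up scores stand in for apply_length_penalty plus re-ranking and are assumptions for illustration, not Sotopia's actual scoring.

import gradio as gr
import pandas as pd

def recompute(length_penalty: float) -> pd.DataFrame:
    # Toy stand-in for apply_length_penalty + re-ranking: penalize each
    # model's Elo by its average response length, then sort and rank.
    df = pd.DataFrame({
        "Model": ["model-a", "model-b"],
        "Overall Elo": [1100.0, 1050.0],
        "Length": [210.0, 120.0],
    })
    df["Overall Elo"] -= length_penalty * df["Length"]
    df = df.sort_values(by="Overall Elo", ascending=False)
    df.insert(0, "Rank", range(1, 1 + len(df)))
    return df

with gr.Blocks() as demo:
    slider = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=0.5,
                       label="Length Penalty")
    table = gr.components.Dataframe(value=recompute(0.5), interactive=False)
    # The event wiring this commit deletes (it was already commented out):
    slider.change(fn=recompute, inputs=[slider], outputs=[table])

demo.launch()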
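
What survives the commit is a one-shot build: benchmark_table reads the JSONL results, applies apply_length_penalty once with the fixed DEFAULT_LP, sorts by the goal score, and renders a non-interactive gr.Dataframe. A stripped-down sketch of that flow follows, assuming the data file exists at the path shown in the diff; the no-op apply_length_penalty here is a hypothetical placeholder for the real sotopia_space.utils helper, whose internals this commit does not show.

import gradio as gr
import pandas as pd

DEFAULT_LP = 0.5

def apply_length_penalty(df: pd.DataFrame, lp: float) -> pd.DataFrame:
    # Placeholder: the real helper lives in sotopia_space.utils and
    # adjusts scores for response length; this stub passes data through.
    return df

original_df = pd.read_json("data_dir/models_vs_gpt35.jsonl", lines=True)
main_df = apply_length_penalty(original_df, lp=DEFAULT_LP)
main_df = main_df.sort_values(by="GOAL [0, 10]", ascending=False)
main_df.insert(0, "Rank", range(1, 1 + len(main_df)))

with gr.Blocks() as demo:
    gr.Markdown("**Vs GPT3.5**: The interlocutors are compared against "
                "GPT-3.5, the baseline model.")
    gr.components.Dataframe(value=main_df, interactive=False, min_width=60)

demo.launch()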