Yotam-Perlitz committed
Commit e2be414 • 1 Parent(s): 1035432

add upload benchmark option


Signed-off-by: Yotam-Perlitz <y.perlitz@ibm.com>

Files changed (1)
app.py +153 -80
app.py CHANGED
@@ -7,28 +7,36 @@ import streamlit as st
 from bat import Benchmark, Config, Reporter, Tester
 from bat.utils import get_holistic_benchmark
 
-benchmarks_dict = {
-    "arena_elo": "LMSys Arena",
-    "mt_bench": "MT Bench",
-    "mixeval": "Mix Eval",
-    "alpacav2": "AlpacaEval V2",
-    "arena_hard": "Arena Hard",
-    "arc_c": "ARC-C",
-    "eq_benchv2": "EQ Bench V2",
-    "agieval": "AGIEval",
-    "llmonitor": "LLMonitor",
-    "bbh": "BBH",
-    "mmlu": "MMLU",
-    "alpacav1": "AlpacaEval V1",
-    "magi": "MAGI",
-    "alpacaeval2_lc": "AlpacaEval V2 Length Adjusted",
-    "gpt4all": "GPT-4-All",
-    "humaneval": "HumanEval",
-    "mbpp": "MBPP",
-    "hellaswag": "HellaSwag",
-    "hugging_6": "HF OpenLLM V1",
-    "winogrande": "Winogrande",
-}
+
+def get_nice_benchmark_name(bench_name):
+    benchmarks_dict = {
+        "arena_elo": "LMSys Arena",
+        "mt_bench": "MT Bench",
+        "mixeval": "Mix Eval",
+        "alpacav2": "AlpacaEval V2",
+        "arena_hard": "Arena Hard",
+        "arc_c": "ARC-C",
+        "eq_benchv2": "EQ Bench V2",
+        "agieval": "AGIEval",
+        "llmonitor": "LLMonitor",
+        "bbh": "BBH",
+        "mmlu": "MMLU",
+        "alpacav1": "AlpacaEval V1",
+        "magi": "MAGI",
+        "alpacaeval2_lc": "AlpacaEval V2 Length Adjusted",
+        "gpt4all": "GPT-4-All",
+        "humaneval": "HumanEval",
+        "mbpp": "MBPP",
+        "hellaswag": "HellaSwag",
+        "hugging_6": "HF OpenLLM V1",
+        "winogrande": "Winogrande",
+    }
+
+    if bench_name in benchmarks_dict:
+        return benchmarks_dict[bench_name]
+    else:
+        return bench_name
+
 
 st.markdown(
     """<h1 style='text-align: center; color: black;'>🏋️‍♂️ BenchBench Leaderboard 🏋️‍♂️</h1>""",
@@ -47,46 +55,57 @@ st.subheader("The Leaderboard", divider=True)
 # st.subheader("🏋️‍♂️ BenchBench Leaderboard 🏋", divider=True)
 
 leftcol, rightcol = st.columns([2, 1])
-with leftcol:
-    with st.expander("Leaderboard configurations (defaults are great BTW)", icon="⚙️"):
-        with st.form("my_form"):
-            all_scenarios_for_aggragate_with_all = all_scenarios_for_aggragate.tolist()
-            all_scenarios_for_aggragate_with_all.append("All Holistic")
-
-            aggragate_scenarios = st.multiselect(
-                "Scenarios in Aggregate",
-                all_scenarios_for_aggragate_with_all,
-                ["All Holistic"],
-                # all_scenarios_for_aggragate,
-            )
-
-            corr_type = st.selectbox(
-                label="Select Correlation type", options=["kendall", "pearson"], index=0
-            )
-
-            aggragate_scenario_blacklist = (
-                [
-                    scen
-                    for scen in all_scenarios_for_aggragate
-                    if scen not in aggragate_scenarios
-                ]
-                if "All Holistic" not in aggragate_scenarios
-                else []
-            )
-
-            model_select_strategy = st.selectbox(
-                label="Select strategy",
-                options=["random", "top_aggregate", "somewhere_aggregate"],
-                index=0,
-            )
-
-            n_models_taken_list = [5]
-            n_exps = 10
-
-            submitted = st.form_submit_button(label="Run BAT")
-
-with rightcol:
-    st.button("➕ Add your benchmark here!")
+
+with st.expander("Leaderboard configurations (defaults are great BTW)", icon="⚙️"):
+    with st.form("my_form"):
+        all_scenarios_for_aggragate_with_all = all_scenarios_for_aggragate.tolist()
+        all_scenarios_for_aggragate_with_all.append("All Holistic")
+
+        aggragate_scenarios = st.multiselect(
+            "Scenarios in Aggregate",
+            all_scenarios_for_aggragate_with_all,
+            ["All Holistic"],
+            # all_scenarios_for_aggragate,
+        )
+
+        corr_type = st.selectbox(
+            label="Select Correlation type", options=["kendall", "pearson"], index=0
+        )
+
+        aggragate_scenario_blacklist = (
+            [
+                scen
+                for scen in all_scenarios_for_aggragate
+                if scen not in aggragate_scenarios
+            ]
+            if "All Holistic" not in aggragate_scenarios
+            else []
+        )
+
+        model_select_strategy = st.selectbox(
+            label="Select strategy",
+            options=["random", "top_aggregate", "somewhere_aggregate"],
+            index=0,
+        )
+
+        n_models_taken_list = [5]
+        n_exps = 10
+
+        submitted = st.form_submit_button(label="Run BAT")
+
+
+uploaded_file = st.file_uploader("add your benchmark as a CSV")
+st.download_button(
+    label="Download example CSV",
+    data=pd.read_csv("assets/mybench.csv").to_csv().encode("utf-8"),
+    file_name="mybench.csv",
+    mime="text/csv",
+)
+
+my_benchmark = Benchmark()
+if uploaded_file is not None:
+    df = pd.read_csv(uploaded_file)
+    my_benchmark.assign_df(df, data_source="Uploaded Benchmark")
 
 
 def run_load(
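Note: this hunk replaces the inert "Add your benchmark here!" button with a working upload path: `st.file_uploader` hands back a file-like object, `pd.read_csv` parses it, and `Benchmark.assign_df` tags the rows as "Uploaded Benchmark" so they can be highlighted later. A rough sketch of the parsing step, assuming a model/scenario/score layout for the CSV (an assumption; the authoritative schema ships in assets/mybench.csv):

```python
import io

import pandas as pd

# Hypothetical upload payload; the real column layout is defined by assets/mybench.csv.
csv_bytes = b"model,scenario,score\nmodel-a,mybench,0.71\nmodel-b,mybench,0.64\n"

# st.file_uploader returns a file-like object, which read_csv accepts directly.
df = pd.read_csv(io.BytesIO(csv_bytes))
print(df)
```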
@@ -95,6 +114,8 @@ def run_load(
     model_select_strategy_list=["random"],
     corr_types=["kendall"],
     n_exps=10,
+    my_benchmark=Benchmark(),
+    use_caching=False,
 ):
     # Create a hash of the inputs to generate a unique cache file for each set of inputs
     input_str = (
@@ -104,6 +125,14 @@ def run_load(
         + str(corr_types)
         + str(n_exps)
     )
+
+    if not my_benchmark.is_empty:
+        input_str += str(
+            hashlib.sha256(
+                my_benchmark.df.to_csv(index=False).encode("utf-8")
+            ).hexdigest()
+        )
+
     input_hash = hashlib.md5(input_str.encode()).hexdigest()
     cache_file = f"agreements_cache_{input_hash}.csv"
 
@@ -112,7 +141,7 @@ def run_load(
     cache_path = os.path.join(cache_dir, cache_file)
 
     # Check if the cache file exists
-    if os.path.exists(cache_path):
+    if os.path.exists(cache_path) and use_caching:
         print("Loading cached results...")
         agreements = pd.read_csv(cache_path)
         return agreements
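Note: taken together, the three hunks above make the cache key content-addressed: the uploaded benchmark's CSV bytes are digested with SHA-256 and folded into the string that is then MD5-hashed into the cache filename, so identical settings with different uploads no longer collide; the new `use_caching=False` default also disables cache reads unless a caller opts in. A standalone sketch of the key construction (function and variable names here are illustrative):

```python
import hashlib

def cache_key(settings, upload_csv=None):
    # Fold the upload's content digest into the key so different uploads
    # with identical settings map to different cache files.
    if upload_csv is not None:
        settings += hashlib.sha256(upload_csv.encode("utf-8")).hexdigest()
    return hashlib.md5(settings.encode()).hexdigest()

print(cache_key("kendall|n_exps=10"))
print(cache_key("kendall|n_exps=10", "model,scenario,score\n"))  # different key
```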
@@ -126,11 +155,33 @@ def run_load(
         model_select_strategy_list=model_select_strategy_list,
         corr_types=corr_types,
         n_exps=n_exps if n_models_taken_list != [0] else 1,
-        # reference_data_path="data/combined_holistic.csv",
     )
 
-    holistic = get_holistic_benchmark()
-    holistic_scenarios = holistic.get_scenarios()
+    holistic_scenarios = [
+        "arena_hard",
+        "mixeval",
+        "agieval",
+        "arc_c",
+        "alpacav1",
+        "alpacav2",
+        "alpacaeval2_lc",
+        "arena_elo",
+        "bbh",
+        "eq_benchv2",
+        "gpt4all",
+        "hugging_6",
+        "llmonitor",
+        "magi",
+        "mmlu",
+        "mt_bench",
+        "biggen_mwr",
+        "olmes_average",
+        "mmlu_pro",
+    ]
+    holistic = Benchmark()
+    holistic.load_local_catalog()
+    holistic.df = holistic.df.query("scenario in @holistic_scenarios")
+
     holistic.clear_repeated_scenarios()
     holistic.add_aggragete(
         new_col_name="aggregate",
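Note: the rewritten loading path swaps `get_holistic_benchmark()` for an explicit scenario whitelist applied to the local catalog. The filter relies on `DataFrame.query`'s `@name` syntax, which resolves the name from the enclosing Python scope; a self-contained example:

```python
import pandas as pd

df = pd.DataFrame({"scenario": ["mmlu", "mybench", "bbh"], "score": [0.6, 0.7, 0.5]})

holistic_scenarios = ["mmlu", "bbh"]
# '@holistic_scenarios' pulls the local Python list into the query expression.
print(df.query("scenario in @holistic_scenarios"))      # rows kept for `holistic`
print(df.query("scenario not in @holistic_scenarios"))  # rows kept for `allbench`
```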
@@ -139,16 +190,18 @@ def run_load(
         min_scenario_for_models_to_appear_in_agg=5,
     )
 
-    allbench = Benchmark(
-        pd.read_csv("assets/combined_20240704.csv"),
-        # data_source=newbench_name,
-    )
+    allbench = Benchmark()
+    allbench.load_local_catalog()
+
+    # allbench.df = allbench.df[~allbench.df["source"].str.contains("livebench")]
+
+    allbench.extend(my_benchmark)
     allbench.df = allbench.df.drop(columns=["tag"])
     allbench.clear_repeated_scenarios()
     allbench.df = allbench.df.query("scenario not in @holistic_scenarios")
 
-    allbench.df = allbench.df[~allbench.df["scenario"].str.contains("_mixed")]
-    allbench.df = allbench.df[~allbench.df["scenario"].str.contains("agentbench")]
+    # allbench.df = allbench.df[~allbench.df["scenario"].str.contains("_mixed")]
+    # allbench.df = allbench.df[~allbench.df["scenario"].str.contains("agentbench")]
 
     # st.dataframe(holistic.df.query('scenario=="aggregate"'))
 
@@ -158,6 +211,10 @@ def run_load(
 
     # len(allbench.get_scenario_appearences_count().keys())
 
+    allbench.df.query('source=="BlueBench"').model.unique()
+
+    allbench.df.query('scenario=="aggregate"').model.unique()
+
     agreements = tester.all_vs_all_agreement_testing(
         allbench, single_source_scenario="aggregate"
     )
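Note: `Benchmark.extend` belongs to the `bat` package, so its contract is not visible in this diff; since the very next lines treat `allbench.df` as one flat table, it presumably appends the uploaded rows to the catalog. A rough plain-pandas equivalent under that assumption (column names illustrative):

```python
import pandas as pd

catalog = pd.DataFrame({"model": ["m1"], "scenario": ["mmlu"], "score": [0.6]})
uploaded = pd.DataFrame({"model": ["m1"], "scenario": ["mybench"], "score": [0.7]})

# Assumed behavior of extend(): row-wise concatenation into one table.
allbench_df = pd.concat([catalog, uploaded], ignore_index=True)
print(allbench_df)
```

The two bare `.query(...).model.unique()` calls added directly above compute values that are immediately discarded; they appear to be debugging leftovers.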
@@ -173,8 +230,12 @@ agreements = run_load(
     model_select_strategy_list=[model_select_strategy],
     corr_types=[corr_type],
     n_exps=n_exps,
+    my_benchmark=my_benchmark,
 )
 
+if not my_benchmark.is_empty:
+    print()
+
 reporter = Reporter()
 z_scores = reporter.get_all_z_scores(agreements=agreements, aggragate_name="aggregate")
 
@@ -201,17 +262,29 @@ data = (
 
 data = data[~data["Source"].str.contains("livebench")]
 data = data[~data["Source"].str.contains("biggen")]
-data.drop(columns=["Source"], inplace=True)
-data["Benchmark"] = data["Benchmark"].apply(lambda x: benchmarks_dict[x])
+# data.drop(columns=["Source"], inplace=True)
+data["Benchmark"] = data["Benchmark"].apply(lambda x: get_nice_benchmark_name(x))
+
 
 # Apply coloring based on 'Z' values
-
-styled_data = data.style.background_gradient(
-    subset=["Z Score"],
-    cmap="RdYlGn",
-    vmin=-data["Z Score"].abs().max(),
-    vmax=data["Z Score"].abs().max(),
-).format(subset=["Z Score", corr_name, "p value of Corr."], formatter="{:.2}")
+def highlight_uploaded_benchmark(row):
+    if row["Source"] == "Uploaded Benchmark":
+        return ["background-color: rgba(100,100,100,0.1)"] * len(row)
+    else:
+        return [""] * len(row)
+
+
+styled_data = (
+    data.style.background_gradient(
+        subset=["Z Score"],
+        cmap="RdYlGn",
+        vmin=-data["Z Score"].abs().max(),
+        vmax=data["Z Score"].abs().max(),
+    )
+    .format(subset=["Z Score", corr_name, "p value of Corr."], formatter="{:.2}")
+    .apply(highlight_uploaded_benchmark, axis=1)
+)
+
 
 st.dataframe(
     data=styled_data,
 
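Note: the table styling is rebuilt as a chain: a diverging gradient on the Z Score column, numeric formatting, and a row-wise `Styler.apply` that shades rows sourced from the upload. The same chain on a toy frame, runnable outside Streamlit (`background_gradient` requires matplotlib to be installed):

```python
import pandas as pd

data = pd.DataFrame(
    {"Source": ["catalog", "Uploaded Benchmark"], "Z Score": [1.3, -0.8]}
)

def highlight_uploaded_benchmark(row):
    # Shade every cell of rows that came from the user's upload.
    if row["Source"] == "Uploaded Benchmark":
        return ["background-color: rgba(100,100,100,0.1)"] * len(row)
    return [""] * len(row)

bound = data["Z Score"].abs().max()  # symmetric limits center the colormap at 0
styled = (
    data.style.background_gradient(
        subset=["Z Score"], cmap="RdYlGn", vmin=-bound, vmax=bound
    )
    .format(subset=["Z Score"], formatter="{:.2}")
    .apply(highlight_uploaded_benchmark, axis=1)
)
html = styled.to_html()  # the app passes the Styler straight to st.dataframe
```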