Spaces:

allenai
/

ZebraLogic

Running

App Files Community

yuchenlin committed on Jul 11

Commit

1757118

•

1 Parent(s): 1c919b3

formatting

Browse files

Files changed (6) hide show

.gitignore +2 -5
app.py +31 -34
data_utils.py +2 -5
model_info.json +1 -0
update_data.sh +4 -40
utils_display.py +6 -1

.gitignore CHANGED Viewed

@@ -1,6 +1,3 @@
-*.pyc
-ZeroEval-main/.DS_Store
-ZeroEval-main/result_dirs/.DS_Store
-ZeroEval-main/result_dirs/zebra-grid/.DS_Store
-.DS_Store


1
2	+ *.pyc
3	+ *.DS_Store

app.py CHANGED Viewed

@@ -37,59 +37,56 @@ with open("_metrics.md", "r") as f:
 original_df = None
 # available_models = [] # to be filled in later
 available_models = list(model_info.keys())
 def _tab_leaderboard():
-    global original_df, available_models, gpt4t_dfs, haiku_dfs, llama_dfs, score_df
     with gr.TabItem("📊 Main", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
         default_main_df = original_df.copy()
-        default_main_df.insert(0, "", range(1, 1 + len(default_main_df)))
-        default_main_df_no_task = default_main_df.copy()
-        # default_main_df_no_task = hide_task_column(default_main_df)
-        # default_main_df_no_task = rerank(default_main_df_no_task, rank_column=WB_ELO_COLUMN)
-        # default_main_df_no_task = rerank(default_main_df_no_task, rank_column=HYBRID_AVG_COLUMN)
-        with gr.Row():
-            # with gr.Column(scale=5):
-                # gr.Markdown(LEADERBOARD_REMARKS_MAIN, elem_classes="markdown-text-small top-left-LP")
-                # with gr.Row():
-                    # with gr.Column(scale=2):
-                        # md = gr.Markdown(" ### 👀 More presentation options ⬇️", elem_classes="markdown-text")
-                    # with gr.Column(scale=3):
-            # with gr.Column(scale=2):
-                # gr.Markdown(f"""**__🪧  Default options:__**  K={DEFAULT_K}; Hybrid-Macro; for best corr w/ LMSYS Elo.""", elem_classes="markdown-text")
-                # gr.Markdown(LENGTH_MARGIN_DESC_MD, elem_classes="markdown-text-tiny no_margin")
-            with gr.Column(scale=5):
-                with gr.Accordion("💬 Metric explanations", open=False, elem_classes="accordion-label"):
-                    gr.Markdown(LEADERBOARD_REMARKS_MAIN, elem_classes="markdown-text-small no_margin")
-                rank_column_radio = gr.Radio(["🆚+💯 Hybrid", "🆚 Reward-Mix (Pairwise)", "💯 Score (Individual)", "🌟 WB Elo (beta)" ], show_label=False, elem_id="rank-column-radio",
-                                             value="🌟 WB Elo (beta)"
-                                            # value="🆚+💯 Hybrid"
-                                             )
-            with gr.Column(scale=2):
-                with gr.Row():
-                    checkbox_show_task_categorized = gr.Checkbox(label="🆚 by Task Type", elem_id="show-task-categorized", value=False)
-                    show_open_source_model_only = gr.Checkbox(label="🔑 Open Models", elem_id="show-open-source-models", value=False)
         # with gr.Row():
         #     with gr.Column(scale=2):
         leaderboard_table = gr.components.Dataframe(
-            value=default_main_df_no_task,
             datatype= ["number", "markdown", "markdown", "number"],
             # max_rows=None,
             height=6000,
             elem_id="leaderboard-table",
             interactive=False,
             visible=True,
-            column_widths=[50, 260,120, 120, 120, 130,100,100,110,100],
             wrap=True
             # min_width=60,
         )
         # checkbox_show_task_categorized.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
         # show_open_source_model_only.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
         # rank_column_radio.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
 def _tab_submit():

 original_df = None
 # available_models = [] # to be filled in later
 available_models = list(model_info.keys())
+def df_filters(mode_selection_radio, show_open_source_model_only):
+    global original_df
+    # remove the rows when the model contains "❌"
+    original_df = original_df[~original_df["Model"].str.contains("❌")]
+    modes = {
+        "greedy": ["greedy"],
+        "sampling (Temp=0.5)": ["sampling"],
+        "all": ["greedy", "sampling"]
+    }
+    # filter the df by the mode_selection_radio
+    default_main_df = original_df[original_df["Mode"].isin(modes[mode_selection_radio])]
+    default_main_df.insert(0, "", range(1, 1 + len(default_main_df)))
+    return default_main_df.copy()
+def _gstr(text):
+    return gr.Text(text, visible=False)
 def _tab_leaderboard():
+    global original_df, available_models
     with gr.TabItem("📊 Main", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
         default_main_df = original_df.copy()
+        # default_main_df.insert(0, "", range(1, 1 + len(default_main_df)))
+        # default_main_df_no_task = default_main_df.copy()
+        default_mode = "greedy"
+        default_main_df = df_filters(default_mode, False)
+        with gr.Row():
+            with gr.Column(scale=5):
+                mode_selection_radio = gr.Radio(["greedy", "sampling (Temp=0.5)", "all"], show_label=False, elem_id="rank-column-radio", value=default_mode)
         # with gr.Row():
         #     with gr.Column(scale=2):
         leaderboard_table = gr.components.Dataframe(
+            value=default_main_df,
             datatype= ["number", "markdown", "markdown", "number"],
             # max_rows=None,
             height=6000,
             elem_id="leaderboard-table",
             interactive=False,
             visible=True,
+            column_widths=[50, 260, 100, 100, 120, 120, 100,100,110,100],
             wrap=True
             # min_width=60,
         )
         # checkbox_show_task_categorized.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
         # show_open_source_model_only.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
         # rank_column_radio.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
+        mode_selection_radio.change(fn=df_filters, inputs=[mode_selection_radio, _gstr("")], outputs=[leaderboard_table])
 def _tab_submit():

data_utils.py CHANGED Viewed

@@ -32,11 +32,8 @@ def post_processing(df, column_names, rank_column=RANKING_COLUMN, ordered_column
         if col == "Model" and click_url:
             df[col] = df[col].apply(lambda x: x.replace(x, make_clickable_model(x)))
         else:
-            df[col] = df[col].apply(formatter) # For numerical values
-        if "Elo" in col:
-            df[col] = df[col].replace('-', np.nan).astype(float)
     df.rename(columns=column_names, inplace=True)
     list_columns = [col for col in ordered_columns if col in df.columns]
     df = df[list_columns]

         if col == "Model" and click_url:
             df[col] = df[col].apply(lambda x: x.replace(x, make_clickable_model(x)))
         else:
+            df[col] = df[col].apply(formatter) # For numerical values
     df.rename(columns=column_names, inplace=True)
     list_columns = [col for col in ordered_columns if col in df.columns]
     df = df[list_columns]

model_info.json CHANGED Viewed

@@ -53,6 +53,7 @@
     "deepseek-coder": {"pretty_name": "DeepSeek-Coder-V2", "hf_model_id": "https://platform.deepseek.com/api-docs/api/deepseek-api/", "open": true},
     "gemma-2-27b-it@nvidia": {"pretty_name": "Gemma-2-27B-it", "hf_model_id": "https://huggingface.co/google/gemma-2-27b-it"},
     "gemma-2-9b-it@nvidia": {"pretty_name": "Gemma-2-9B-it", "hf_model_id": "https://huggingface.co/google/gemma-2-9b-it"},
     "neo_7b_instruct_v0.1": {"pretty_name": "Neo-7B-Instruct", "hf_model_id": "m-a-p/neo_7b_instruct_v0.1"},
     "Yi-34B-Chat": {"pretty_name": "Yi-34B-Chat", "hf_model_id": "01-ai/Yi-34B-chat"},
     "vicuna-13b-v1.5": {"pretty_name": "Vicuna-13b-v1.5", "hf_model_id": "lmsys/vicuna-13b-v1.5"},

     "deepseek-coder": {"pretty_name": "DeepSeek-Coder-V2", "hf_model_id": "https://platform.deepseek.com/api-docs/api/deepseek-api/", "open": true},
     "gemma-2-27b-it@nvidia": {"pretty_name": "Gemma-2-27B-it", "hf_model_id": "https://huggingface.co/google/gemma-2-27b-it"},
     "gemma-2-9b-it@nvidia": {"pretty_name": "Gemma-2-9B-it", "hf_model_id": "https://huggingface.co/google/gemma-2-9b-it"},
+    "gemma-2-9b-it": {"pretty_name": "Gemma-2-9B-it", "hf_model_id": "https://huggingface.co/google/gemma-2-9b-it", "hidden": true},
     "neo_7b_instruct_v0.1": {"pretty_name": "Neo-7B-Instruct", "hf_model_id": "m-a-p/neo_7b_instruct_v0.1"},
     "Yi-34B-Chat": {"pretty_name": "Yi-34B-Chat", "hf_model_id": "01-ai/Yi-34B-chat"},
     "vicuna-13b-v1.5": {"pretty_name": "Vicuna-13b-v1.5", "hf_model_id": "lmsys/vicuna-13b-v1.5"},

update_data.sh CHANGED Viewed

@@ -1,40 +1,4 @@
-TARGET_DIR="ZeroEval-main"
-rm -r $TARGET_DIR
-# Download the ZIP file
-curl -L -o zeroeval.zip https://github.com/yuchenlin/ZeroEval/archive/refs/heads/main.zip
-unzip zeroeval.zip
-rm zeroeval.zip
-#!/bin/bash
-# Define the target directory and the exception folder
-EXCEPTION_FOLDER="result_dirs"
-# Ensure the target directory exists
-if [ -d "$TARGET_DIR" ]; then
-  # Loop through each item in the target directory
-  for item in "$TARGET_DIR"/*; do
-    # Check if it is not the exception folder
-    if [ "$(basename "$item")" != "$EXCEPTION_FOLDER" ]; then
-      # Remove the item (file or directory)
-      rm -rf "$item"
-      echo "Removed: $item"
-    fi
-  done
-else
-  echo "Target directory does not exist: $TARGET_DIR"
-fi
-# only keep the result_dirs/zebra-grid under result_dirs folder; remove all other sub-folders under result_dirs
-# Remove all subdirectories in result_dirs except zebra-grid
-find "$TARGET_DIR/result_dirs" -maxdepth 1 -type d ! -name 'zebra-grid' ! -name 'result_dirs' -exec rm -rf {} +
-rm -rf $TARGET_DIR/.github
-rm -rf $TARGET_DIR/.gitignore
-# tables
-# bash update_table.sh

+# download the file from https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid.summary.json
+# and put it to ZeroEval-main/result_dirs/zebra-grid.summary.json
+mkdir -p ZeroEval-main/result_dirs
+wget https://raw.githubusercontent.com/yuchenlin/ZeroEval/main/result_dirs/zebra-grid.summary.json -O ZeroEval-main/result_dirs/zebra-grid.summary.json

utils_display.py CHANGED Viewed

@@ -7,7 +7,9 @@ def make_clickable_model(model_name):
     global model_info
     modified_model_name = model_name
     if model_name in model_info:
-        if model_info[model_name]["hf_model_id"].startswith("http"):
             link = model_info[model_name]["hf_model_id"]
             modified_model_name = f'🔒 <a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_info[model_name]["pretty_name"]}</a>'
         else:
@@ -16,6 +18,9 @@ def make_clickable_model(model_name):
             if "Neo-7B" in modified_model_name:
                 # models that are fully open source
                 modified_model_name = modified_model_name.replace("🔑", "💎🔑")
     if "🚨</a>" in modified_model_name:
         modified_model_name = modified_model_name.replace(' 🚨</a>', '</a> 🚨')

     global model_info
     modified_model_name = model_name
     if model_name in model_info:
+        is_open_model = model_info[model_name]["hf_model_id"].startswith("http")
+        is_open_model = model_info[model_name].get("open", False)
+        if not is_open_model:
             link = model_info[model_name]["hf_model_id"]
             modified_model_name = f'🔒 <a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_info[model_name]["pretty_name"]}</a>'
         else:
             if "Neo-7B" in modified_model_name:
                 # models that are fully open source
                 modified_model_name = modified_model_name.replace("🔑", "💎🔑")
+        hidden = model_info[model_name].get("hidden", False)
+        if hidden:
+            modified_model_name = f'❌ {modified_model_name}'
     if "🚨</a>" in modified_model_name:
         modified_model_name = modified_model_name.replace(' 🚨</a>', '</a> 🚨')