# human_eval_llm_leaderboard
#
# Runtime error
#
# File size: 5,496 Bytes

import json
import os
from datetime import datetime, timezone


import gradio as gr
import numpy as np
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

from src.assets.text_content import *
from src.elo_leaderboard.load_results import get_elo_plots, get_elo_results_dicts
from src.assets.css_html_js import custom_css, get_window_url_params # left in case you need them
from src.utils_display import EloEvalColumn, fields, styled_error, styled_warning, styled_message
from src.init import load_all_info_from_hub

def _env_flag(name, default):
    """Return the boolean value of the environment variable *name*.

    ``bool()`` on an environment string is True for every non-empty value,
    so ``IS_PUBLIC=False`` or ``IS_PUBLIC=0`` would still switch the flag on.
    The text is interpreted explicitly instead.

    Args:
        name: Name of the environment variable to read.
        default: Value returned when the variable is not set at all.

    Returns:
        ``default`` when the variable is unset; False when it is empty or one
        of ``0`` / ``false`` / ``no`` / ``off`` (case-insensitive, surrounding
        whitespace ignored); True for any other value.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    return raw.strip().lower() not in {"", "0", "false", "no", "off"}


# clone / pull the lmeh eval data
# Token read from the environment; None when H4_TOKEN is not set.
H4_TOKEN = os.environ.get("H4_TOKEN", None)
HUMAN_EVAL_REPO = "HuggingFaceH4/scale-human-eval"
GPT_4_EVAL_REPO = "HuggingFaceH4/open_llm_leaderboard_oai_evals"
# Defaults to True when the variable is unset.
IS_PUBLIC = _env_flag("IS_PUBLIC", True)
# When False, the figures section of the page is not built.
ADD_PLOTS = False

EVAL_REQUESTS_PATH = "auto_evals/eval_requests"

# Hub client used by restart_space().
api = HfApi()


def restart_space():
    """Request a restart of the Space identified by a fixed repo id.

    The call is authenticated with ``H4_TOKEN``.

    NOTE(review): the repo id names "open_llm_leaderboard"; confirm this is
    the Space this app is actually deployed to.
    """
    space_id = "HuggingFaceH4/open_llm_leaderboard"
    api.restart_space(repo_id=space_id, token=H4_TOKEN)

# Repo handles returned by load_all_info_from_hub for the two eval repos.
human_eval_repo, gpt_4_eval_repo = load_all_info_from_hub(
    HUMAN_EVAL_REPO,
    GPT_4_EVAL_REPO,
)

# Column names and datatypes of the leaderboard tables, taken from the
# EloEvalColumn field definitions; tables are sorted on the gpt4 column.
ELO_COLS = [column.name for column in fields(EloEvalColumn)]
ELO_TYPES = [column.type for column in fields(EloEvalColumn)]
ELO_SORT_COL = EloEvalColumn.gpt4.name


def has_no_nan_values(df, columns):
    """Return a row-wise boolean mask: True where none of *columns* is missing.

    Args:
        df: DataFrame to inspect.
        columns: Column labels to check in each row.

    Returns:
        A boolean Series indexed like ``df``.
    """
    selected = df[columns]
    is_present = selected.notna()
    return is_present.all(axis=1)


def has_nan_values(df, columns):
    """Return a row-wise boolean mask: True where any of *columns* is missing.

    Args:
        df: DataFrame to inspect.
        columns: Column labels to check in each row.

    Returns:
        A boolean Series indexed like ``df``.
    """
    selected = df[columns]
    is_missing = selected.isna()
    return is_missing.any(axis=1)


def get_elo_leaderboard(df_instruct, df_code_instruct, tie_allowed=False):
    """Build the Elo leaderboard table from the two evaluation frames.

    When a human-eval repo handle is available it is pulled first. The
    records returned by ``get_elo_results_dicts`` are then loaded into a
    DataFrame, sorted descending on ``ELO_SORT_COL`` and restricted to
    ``ELO_COLS``.

    Args:
        df_instruct: Evaluation data forwarded to ``get_elo_results_dicts``.
        df_code_instruct: Evaluation data forwarded to
            ``get_elo_results_dicts``.
        tie_allowed: Forwarded unchanged to ``get_elo_results_dicts``.

    Returns:
        The sorted DataFrame containing the ``ELO_COLS`` columns.
    """
    if human_eval_repo:
        print("Pulling human_eval_repo changes")
        human_eval_repo.git_pull()

    records = get_elo_results_dicts(df_instruct, df_code_instruct, tie_allowed)
    ranked = pd.DataFrame.from_records(records).sort_values(
        by=ELO_SORT_COL, ascending=False
    )
    return ranked[ELO_COLS]


def get_elo_elements():
    """Load the human-eval JSON files and build everything the page displays.

    Returns:
        A 6-tuple: the leaderboard computed with ``tie_allowed=False``, the
        leaderboard computed with ``tie_allowed=True``, and the four objects
        returned by ``get_elo_plots`` (called with ``tie_allowed=False``).
    """
    without_code = pd.read_json("human_evals/without_code.json")
    with_code = pd.read_json("human_evals/with_code.json")

    # Same call order as always: strict table, tie table, then the plots.
    strict_board = get_elo_leaderboard(without_code, with_code, tie_allowed=False)
    tie_board = get_elo_leaderboard(without_code, with_code, tie_allowed=True)
    fig_1, fig_2, fig_3, fig_4 = get_elo_plots(
        without_code, with_code, tie_allowed=False
    )

    return strict_board, tie_board, fig_1, fig_2, fig_3, fig_4

# Compute the tables and figures once, when the module is executed; the UI
# definition that follows reads these module-level names.
_elo_elements = get_elo_elements()
elo_leaderboard, elo_leaderboard_with_tie_allowed, plot_1, plot_2, plot_3, plot_4 = _elo_elements


# Page layout: title and introduction, the two leaderboard tables, an
# optional figures section, then the citation and changelog accordions.
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    with gr.Row():
        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Column():
        # Explanatory text (2/3 width) next to the logo (1/3 width).
        with gr.Row():
            with gr.Column(scale=2):
                gr.Markdown(HUMAN_GPT_EVAL_TEXT, elem_classes="markdown-text")
            with gr.Column(scale=1):
                gr.Image(
                    "src/assets/scale-hf-logo.png", elem_id="scale-logo", show_label=False
                )
        gr.Markdown("## No tie allowed")
        elo_leaderboard_table = gr.components.Dataframe(
            value=elo_leaderboard,
            headers=ELO_COLS,
            datatype=ELO_TYPES,
            max_rows=5,
        )

        gr.Markdown("## Tie allowed*")
        elo_leaderboard_table_with_tie_allowed = gr.components.Dataframe(
            value=elo_leaderboard_with_tie_allowed,
            headers=ELO_COLS,
            datatype=ELO_TYPES,
            max_rows=5,
        )

        # Raw string: "\*" is not a valid escape sequence in a normal string
        # literal and triggers a warning on recent Python versions. The
        # runtime text (backslash + asterisk) is unchanged.
        gr.Markdown(
            r"\* Results when the scores of 4 and 5 were treated as ties.",
            elem_classes="markdown-text",
        )

        gr.Markdown(
            "Let us know in [this discussion](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/65) which models we should add!",
            elem_id="models-to-add-text",
        )

    # Figures are only added to the page when ADD_PLOTS is enabled. Note that
    # plot_1..plot_4 are rebound here from the figure objects to the gr.Plot
    # components wrapping them.
    if ADD_PLOTS:
        with gr.Box():
            visualization_title = gr.HTML(VISUALIZATION_TITLE)
            with gr.Row():
                with gr.Column():
                    gr.Markdown(f"#### Figure 1: {PLOT_1_TITLE}")
                    plot_1 = gr.Plot(plot_1, show_label=False)
                with gr.Column():
                    gr.Markdown(f"#### Figure 2: {PLOT_2_TITLE}")
                    plot_2 = gr.Plot(plot_2, show_label=False)
            with gr.Row():
                with gr.Column():
                    gr.Markdown(f"#### Figure 3: {PLOT_3_TITLE}")
                    plot_3 = gr.Plot(plot_3, show_label=False)
                with gr.Column():
                    gr.Markdown(f"#### Figure 4: {PLOT_4_TITLE}")
                    plot_4 = gr.Plot(plot_4, show_label=False)

    with gr.Row():
        with gr.Column():
            with gr.Accordion("📙 Citation", open=False):
                # NOTE(review): `.style(...)` on components depends on the
                # installed gradio version - confirm it is still supported.
                citation_button = gr.Textbox(
                    value=CITATION_BUTTON_TEXT,
                    label=CITATION_BUTTON_LABEL,
                    elem_id="citation-button",
                ).style(show_copy_button=True)
        with gr.Column():
            with gr.Accordion("✨ CHANGELOG", open=False):
                changelog = gr.Markdown(CHANGELOG_TEXT, elem_id="changelog-text")



# Schedule restart_space to run every hour on a background scheduler, then
# enable the request queue and start serving the app.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, trigger="interval", seconds=60 * 60)
scheduler.start()
demo.queue(concurrency_count=40).launch()