import argparse
import ast
import glob
import pickle
import traceback
import pandas as pd
import gradio as gr
import numpy as np
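# Module-level slots shared with the rest of the app; leader_component_values
# is filled in build_leaderboard_tab below, while basic_component_values
# appears to be a placeholder reserved for a companion (non-leaderboard) tab.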
basic_component_values = [None] * 6
leader_component_values = [None] * 5
promo_banner = """
llmarena.ru - ИСПОЛЬЗУЙТЕ БЕСПЛАТНО ПОСЛЕДНИЕ ВЕРСИИ ЛУЧШИХ ЧАТ-БОТОВ НА РУССКОМ
"""
def make_default_md_1():
leaderboard_md = f"""
# 🏆 LLM арена на русском: таблица лидеров
{promo_banner}
"""
return leaderboard_md
def make_default_md_2():
leaderboard_md = f"""
Платформа LLM Arena является открытой краудсорсинговой платформой для оценки больших языковых моделей (LLM) на русском языке. Мы собираем парные сравнения от людей, чтобы ранжировать LLM с помощью модели Брэдли-Терри и отображать рейтинги моделей по шкале Эло.
Chatbot Arena на русском зависит от участия сообщества, пожалуйста, внесите свой вклад, отдав свой голос!
- Чтобы **добавить свою модель** в сравнение - напишите нам в tg: [Группа](https://t.me/+bFEOl-Bdmok4NGUy)
- Если вы **нашли ошибку**, либо у вас **есть предложение** - напишите нам: [Роман](https://t.me/roman_kucev)
"""
return leaderboard_md
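# A note on the scale mentioned above: the displayed ratings are Bradley-Terry
# coefficients mapped onto the Elo scale, where the expected win probability is
#   P(A beats B) = 1 / (1 + 10 ** ((R_B - R_A) / 400)).
# The ratings themselves are computed upstream and shipped in the
# elo_results_*.pkl snapshots; this module only renders them.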
def make_arena_leaderboard_md(arena_df, last_updated_time):
total_votes = sum(arena_df["num_battles"])
total_models = len(arena_df)
space = " "
leaderboard_md = f"""
Всего #моделей: **{total_models}**.{space} Всего #голосов: **{total_votes:,}**.{space} Последнее обновление: {last_updated_time}.
***Ранг (UB)**: рейтинг модели (верхняя граница), определяется как один плюс количество моделей, которые статистически лучше целевой модели.
Модель A статистически лучше модели B, когда нижняя граница оценки модели A больше верхней границы оценки модели B (с доверительным интервалом 95%).
См. Рисунок 1 ниже для визуализации доверительных интервалов оценок моделей.
"""
return leaderboard_md
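# The "Rank (UB)" rule described in the markdown above is implemented in
# recompute_final_ranking() below.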
def make_category_arena_leaderboard_md(arena_df, arena_subset_df, name="Overall"):
total_votes = sum(arena_df["num_battles"])
total_models = len(arena_df)
space = " "
total_subset_votes = sum(arena_subset_df["num_battles"])
total_subset_models = len(arena_subset_df)
leaderboard_md = f"""### {cat_name_to_explanation[name]}
#### {space} #модели: **{total_subset_models} ({round(total_subset_models / total_models * 100)}%)** {space} #голоса: **{total_subset_votes:,} ({round(total_subset_votes / total_votes * 100)}%)**{space}
"""
return leaderboard_md
def model_hyperlink(model_name, link):
    # Render the model name as a link; the "Model" dataframe column is
    # markdown-typed, so an HTML anchor is displayed as a clickable link.
    return f'<a target="_blank" href="{link}">{model_name}</a>'
def load_leaderboard_table_csv(filename, add_hyperlink=True):
    # Minimal CSV reader: a header row followed by data rows, split on plain
    # commas (so field values must not contain commas themselves).
    with open(filename) as fin:
        lines = fin.readlines()
    heads = [v.strip() for v in lines[0].split(",")]
    rows = []
    for i in range(1, len(lines)):
        row = [v.strip() for v in lines[i].split(",")]
        item = {}
        for h, v in zip(heads, row):
            # "-" marks a missing value in every numeric column.
            if h == "Arena Elo rating":
                v = int(ast.literal_eval(v)) if v != "-" else np.nan
            elif h == "MMLU":
                v = round(ast.literal_eval(v) * 100, 1) if v != "-" else np.nan
            elif h == "MT-bench (win rate %)":
                # Strip the trailing "%" before parsing.
                v = round(ast.literal_eval(v[:-1]), 1) if v != "-" else np.nan
            elif h == "MT-bench (score)":
                v = round(ast.literal_eval(v), 2) if v != "-" else np.nan
            item[h] = v
        if add_hyperlink:
            item["Model"] = model_hyperlink(item["Model"], item["Link"])
        rows.append(item)
    return rows
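# A hypothetical example of the CSV layout this parser expects (header row
# first; plain comma separation, so fields must not contain commas):
#
#   key,Model,Link,Arena Elo rating,MMLU,MT-bench (score)
#   gpt-4,GPT-4,https://example.com,1250,0.864,8.99
#
# "-" in any of the numeric columns is read as NaN.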
def create_ranking_str(ranking, ranking_difference):
if ranking_difference > 0:
return f"{int(ranking)} \u2191"
elif ranking_difference < 0:
return f"{int(ranking)} \u2193"
else:
return f"{int(ranking)}"
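# Examples: create_ranking_str(3, 2) -> "3 ↑" (the model rose within the
# category), create_ranking_str(5, -1) -> "5 ↓", create_ranking_str(2, 0) -> "2".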
def recompute_final_ranking(arena_df):
# compute ranking based on CI
ranking = {}
for i, model_a in enumerate(arena_df.index):
ranking[model_a] = 1
for j, model_b in enumerate(arena_df.index):
if i == j:
continue
if (
arena_df.loc[model_b]["rating_q025"]
> arena_df.loc[model_a]["rating_q975"]
):
ranking[model_a] += 1
return list(ranking.values())
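# Worked example of the upper-bound rank: with 95% CIs A = [1260, 1300],
# B = [1240, 1285] and C = [1150, 1190], neither A nor B is statistically
# better than the other (their intervals overlap), but both beat C, so the
# resulting ranks are A -> 1, B -> 1, C -> 3.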
def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
    # Sort by rating, compute the CI-based rank, then re-sort so that ties in
    # rank are broken by rating.
    arena_df = arena_df.sort_values(by=["rating"], ascending=False)
    arena_df["final_ranking"] = recompute_final_ranking(arena_df)
    arena_df = arena_df.sort_values(
        by=["final_ranking", "rating"], ascending=[True, False]
    )
if arena_subset_df is not None:
# filter out models not in the arena_df
arena_subset_df = arena_subset_df[arena_subset_df.index.isin(arena_df.index)]
arena_subset_df = arena_subset_df.sort_values(by=["rating"], ascending=False)
arena_subset_df["final_ranking"] = recompute_final_ranking(arena_subset_df)
# keep only the models in the subset in arena_df and recompute final_ranking
arena_df = arena_df[arena_df.index.isin(arena_subset_df.index)]
# recompute final ranking
arena_df["final_ranking"] = recompute_final_ranking(arena_df)
# assign ranking by the order
arena_subset_df["final_ranking_no_tie"] = range(1, len(arena_subset_df) + 1)
arena_df["final_ranking_no_tie"] = range(1, len(arena_df) + 1)
# join arena_df and arena_subset_df on index
arena_df = arena_subset_df.join(
arena_df["final_ranking"], rsuffix="_global", how="inner"
)
arena_df["ranking_difference"] = (
arena_df["final_ranking_global"] - arena_df["final_ranking"]
)
arena_df = arena_df.sort_values(
by=["final_ranking", "rating"], ascending=[True, False]
)
arena_df["final_ranking"] = arena_df.apply(
lambda x: create_ranking_str(x["final_ranking"], x["ranking_difference"]),
axis=1,
)
arena_df["final_ranking"] = arena_df["final_ranking"].astype(str)
values = []
for i in range(len(arena_df)):
row = []
model_key = arena_df.index[i]
try:
model_name = model_table_df[model_table_df["key"] == model_key][
"Model"
].values[0]
ranking = arena_df.iloc[i].get("final_ranking") or i + 1
row.append(ranking)
if arena_subset_df is not None:
row.append(arena_df.iloc[i].get("ranking_difference") or 0)
row.append(model_name)
row.append(round(arena_df.iloc[i]["rating"]))
upper_diff = round(
arena_df.iloc[i]["rating_q975"] - arena_df.iloc[i]["rating"]
)
lower_diff = round(
arena_df.iloc[i]["rating"] - arena_df.iloc[i]["rating_q025"]
)
row.append(f"+{upper_diff}/-{lower_diff}")
row.append(round(arena_df.iloc[i]["num_battles"]))
row.append(
model_table_df[model_table_df["key"] == model_key][
"Organization"
].values[0]
)
row.append(
model_table_df[model_table_df["key"] == model_key]["License"].values[0]
)
cutoff_date = model_table_df[model_table_df["key"] == model_key][
"Knowledge cutoff date"
].values[0]
if cutoff_date == "-":
row.append("Unknown")
else:
row.append(cutoff_date)
values.append(row)
except Exception as e:
traceback.print_exc()
print(f"{model_key} - {e}")
return values
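# Each row produced above matches the dataframe headers used later in this
# file: rank (UB), [rank delta, only when a category subset is shown], model
# link, rating, "+x/-y" 95% CI, number of battles, organization, license and
# knowledge cutoff date.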
key_to_category_name = {
"full": "Overall",
"crowdsourcing/simple_prompts": "crowdsourcing/simple_prompts",
"site_visitors/medium_prompts": "site_visitors/medium_prompts",
"site_visitors/medium_prompts:style control": "site_visitors/medium_prompts:style control"
}
cat_name_to_explanation = {
"Overall": "Все запросы",
"crowdsourcing/simple_prompts": "Запросы, собранные с краудсорсинга. Преимущественно, простые.",
"site_visitors/medium_prompts": "Запросы от пользователей сайта. Содержат более сложные промпты.",
"site_visitors/medium_prompts:style control": "Запросы от пользователей сайта. Содержат более сложные промпты. [Снижено влияние стилистики](https://lmsys.org/blog/2024-08-28-style-control/) ответа на оценку."
}
cat_name_to_baseline = {
"Hard Prompts (English)": "English",
}
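# Categories listed in cat_name_to_baseline are compared against the named
# baseline; every other category falls back to the "Overall" table
# (see update_leaderboard_and_plots below).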
actual_categories = [
"Overall",
"crowdsourcing/simple_prompts",
"site_visitors/medium_prompts",
"site_visitors/medium_prompts:style control"
]
def read_elo_file(elo_results_file, leaderboard_table_file):
arena_dfs = {}
category_elo_results = {}
with open(elo_results_file, "rb") as fin:
elo_results = pickle.load(fin)
last_updated_time = None
if "full" in elo_results:
last_updated_time = elo_results["full"]["last_updated_datetime"].split(
" "
)[0]
for k in key_to_category_name.keys():
if k not in elo_results:
continue
arena_dfs[key_to_category_name[k]] = elo_results[k][
"leaderboard_table_df"
]
category_elo_results[key_to_category_name[k]] = elo_results[k]
data = load_leaderboard_table_csv(leaderboard_table_file)
model_table_df = pd.DataFrame(data)
    return (
        last_updated_time,
        arena_dfs,
        category_elo_results,
        elo_results,
        model_table_df,
    )
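# Assumed shape of the elo_results_*.pkl payload (inferred from the lookups
# above and in build_leaderboard_tab; the file itself is produced upstream):
#   {
#       "full": {
#           "leaderboard_table_df": DataFrame indexed by model key with
#               rating, rating_q025, rating_q975 and num_battles columns,
#           "last_updated_datetime": "YYYY-MM-DD ...",
#           "win_fraction_heatmap": figure, "battle_count_heatmap": figure,
#           "bootstrap_elo_rating": figure, "average_win_rate_bar": figure,
#       },
#       "crowdsourcing/simple_prompts": {...}, ...
#   }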
def build_leaderboard_tab(
elo_results_file, leaderboard_table_file, show_plot=False, mirror=False
):
    (
        last_updated_time,
        arena_dfs,
        category_elo_results,
        elo_results,
        model_table_df,
    ) = read_elo_file(elo_results_file, leaderboard_table_file)
p1 = category_elo_results["Overall"]["win_fraction_heatmap"]
p2 = category_elo_results["Overall"]["battle_count_heatmap"]
p3 = category_elo_results["Overall"]["bootstrap_elo_rating"]
p4 = category_elo_results["Overall"]["average_win_rate_bar"]
arena_df = arena_dfs["Overall"]
default_md = make_default_md_1()
default_md_2 = make_default_md_2()
with gr.Row():
with gr.Column(scale=4):
md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
with gr.Column(scale=1):
vote_button = gr.Button("Голосовать!", link="https://llmarena.ru")
md_2 = gr.Markdown(default_md_2, elem_id="leaderboard_markdown")
    if leaderboard_table_file:
        # model_table_df was already built by read_elo_file above.
with gr.Tabs() as tabs:
arena_table_vals = get_arena_table(arena_df, model_table_df)
with gr.Tab("Арена", id=0):
md = make_arena_leaderboard_md(arena_df, last_updated_time)
lb_description = gr.Markdown(md, elem_id="leaderboard_markdown")
with gr.Row():
with gr.Column(scale=2):
category_dropdown = gr.Dropdown(
choices=actual_categories,
label="Category",
value="Overall",
)
default_category_details = make_category_arena_leaderboard_md(
arena_df, arena_df, name="Overall"
)
with gr.Column(scale=4, variant="panel"):
category_deets = gr.Markdown(
default_category_details, elem_id="category_deets"
)
arena_vals = pd.DataFrame(
arena_table_vals,
columns=[
"Rank* (UB)",
"Model",
"Arena Elo",
"95% CI",
"Votes",
"Organization",
"License",
"Knowledge Cutoff",
],
)
elo_display_df = gr.Dataframe(
headers=[
"Rank* (UB)",
"Model",
"Arena Elo",
"95% CI",
"Votes",
"Organization",
"License",
"Knowledge Cutoff",
],
datatype=[
"str",
"markdown",
"number",
"str",
"number",
"str",
"str",
"str",
],
value=arena_vals.style,
elem_id="arena_leaderboard_dataframe",
height=700,
column_widths=[70, 190, 100, 100, 90, 130, 150, 100],
wrap=True,
)
gr.Markdown(
elem_id="leaderboard_markdown",
)
leader_component_values[:] = [default_md, p1, p2, p3, p4]
if show_plot:
more_stats_md = gr.Markdown(
f"""## Больше статистики Чат-бот Арены""",
elem_id="leaderboard_header_markdown",
)
with gr.Row():
with gr.Column():
gr.Markdown(
"#### Figure 1: Confidence Intervals on Model Strength (via Bootstrapping)",
elem_id="plot-title",
)
plot_3 = gr.Plot(p3, show_label=False)
with gr.Column():
gr.Markdown(
"#### Figure 2: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)",
elem_id="plot-title",
)
plot_4 = gr.Plot(p4, show_label=False)
with gr.Row():
with gr.Column():
gr.Markdown(
"#### Figure 3: Fraction of Model A Wins for All Non-tied A vs. B Battles",
elem_id="plot-title",
)
plot_1 = gr.Plot(
p1, show_label=False, elem_id="plot-container"
)
with gr.Column():
gr.Markdown(
"#### Figure 4: Battle Count for Each Combination of Models (without Ties)",
elem_id="plot-title",
)
plot_2 = gr.Plot(p2, show_label=False)
    if not show_plot:
        gr.Markdown("", elem_id="leaderboard_markdown")
def update_leaderboard_df(arena_table_vals):
        elo_dataframe = pd.DataFrame(
arena_table_vals,
columns=[
"Rank* (UB)",
"Delta",
"Model",
"Arena Elo",
"95% CI",
"Votes",
"Organization",
"License",
"Knowledge Cutoff",
],
)
def highlight_max(s):
return [
"color: green; font-weight: bold"
if "\u2191" in v
else "color: red; font-weight: bold"
if "\u2193" in v
else ""
for v in s
]
def highlight_rank_max(s):
return [
"color: green; font-weight: bold"
if v > 0
else "color: red; font-weight: bold"
if v < 0
else ""
for v in s
]
        return elo_dataframe.style.apply(highlight_max, subset=["Rank* (UB)"]).apply(
            highlight_rank_max, subset=["Delta"]
        )
def update_leaderboard_and_plots(category):
        _, arena_dfs, category_elo_results, _, model_table_df = read_elo_file(
            elo_results_file, leaderboard_table_file
        )
        arena_subset_df = arena_dfs[category]
        # Drop models with too few battles in this category to rate reliably.
        arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 300]
elo_subset_results = category_elo_results[category]
baseline_category = cat_name_to_baseline.get(category, "Overall")
arena_df = arena_dfs[baseline_category]
arena_values = get_arena_table(
arena_df,
model_table_df,
arena_subset_df=arena_subset_df if category != "Overall" else None,
)
if category != "Overall":
arena_values = update_leaderboard_df(arena_values)
arena_values = gr.Dataframe(
headers=[
"Rank* (UB)",
"Delta",
"Model",
"Arena Elo",
"95% CI",
"Votes",
"Organization",
"License",
"Knowledge Cutoff",
],
datatype=[
"str",
"number",
"markdown",
"number",
"str",
"number",
"str",
"str",
"str",
],
value=arena_values,
elem_id="arena_leaderboard_dataframe",
height=700,
column_widths=[70, 70, 200, 90, 100, 90, 120, 150, 100],
wrap=True,
)
else:
arena_values = gr.Dataframe(
headers=[
"Rank* (UB)",
"Model",
"Arena Elo",
"95% CI",
"Votes",
"Organization",
"License",
"Knowledge Cutoff",
],
datatype=[
"str",
"markdown",
"number",
"str",
"number",
"str",
"str",
"str",
],
value=arena_values,
elem_id="arena_leaderboard_dataframe",
height=700,
column_widths=[70, 190, 100, 100, 90, 140, 150, 100],
wrap=True,
)
p1 = elo_subset_results["win_fraction_heatmap"]
p2 = elo_subset_results["battle_count_heatmap"]
p3 = elo_subset_results["bootstrap_elo_rating"]
p4 = elo_subset_results["average_win_rate_bar"]
        more_stats_md = f"""## Больше статистики Чат-бот Арены - {category}
"""
leaderboard_md = make_category_arena_leaderboard_md(
arena_df, arena_subset_df, name=category
)
return arena_values, p1, p2, p3, p4, more_stats_md, leaderboard_md
if leaderboard_table_file:
category_dropdown.change(
fn=update_leaderboard_and_plots,
inputs=[category_dropdown],
outputs=[
elo_display_df,
plot_1,
plot_2,
plot_3,
plot_4,
more_stats_md,
category_deets,
],
)
if show_plot and leaderboard_table_file:
return [md_1, md_2, lb_description, category_deets, elo_display_df, plot_1, plot_2, plot_3, plot_4]
return [md_1]
def build_demo(elo_results_file, leaderboard_table_file):
text_size = gr.themes.sizes.text_lg
theme = gr.themes.Default.load("theme.json")
theme.text_size = text_size
theme.set(
button_large_text_size="40px",
button_small_text_size="40px",
button_large_text_weight="1000",
button_small_text_weight="1000",
button_shadow="*shadow_drop_lg",
button_shadow_hover="*shadow_drop_lg",
checkbox_label_shadow="*shadow_drop_lg",
button_shadow_active="*shadow_inset",
button_secondary_background_fill="*primary_300",
button_secondary_background_fill_dark="*primary_700",
button_secondary_background_fill_hover="*primary_200",
button_secondary_background_fill_hover_dark="*primary_500",
button_secondary_text_color="*primary_800",
button_secondary_text_color_dark="white",
)
with gr.Blocks(
title="LLM арена: таблица лидеров",
theme=theme,
css=block_css,
) as demo:
build_leaderboard_tab(
elo_results_file, leaderboard_table_file, show_plot=True, mirror=True
)
return demo
block_css = """
#notice_markdown .prose {
font-size: 110% !important;
}
#notice_markdown th {
display: none;
}
#notice_markdown td {
padding-top: 6px;
padding-bottom: 6px;
}
#arena_leaderboard_dataframe table {
font-size: 110%;
}
#full_leaderboard_dataframe table {
font-size: 110%;
}
#model_description_markdown {
font-size: 110% !important;
}
#leaderboard_markdown .prose {
font-size: 110% !important;
}
#leaderboard_markdown td {
padding-top: 6px;
padding-bottom: 6px;
}
#leaderboard_dataframe td {
line-height: 0.1em;
}
#about_markdown .prose {
font-size: 110% !important;
}
#ack_markdown .prose {
font-size: 110% !important;
}
#chatbot .prose {
font-size: 105% !important;
}
.sponsor-image-about img {
margin: 0 20px;
margin-top: 20px;
height: 40px;
max-height: 100%;
width: auto;
float: left;
}
.chatbot h1, .chatbot h2, .chatbot h3 {
margin-top: 8px; /* Adjust the value as needed */
margin-bottom: 0px; /* Adjust the value as needed */
padding-bottom: 0px;
}
.chatbot h1 {
font-size: 130%;
}
.chatbot h2 {
font-size: 120%;
}
.chatbot h3 {
font-size: 110%;
}
.chatbot p:not(:first-child) {
margin-top: 8px;
}
.typing {
display: inline-block;
}
.cursor {
display: inline-block;
width: 7px;
height: 1em;
background-color: black;
vertical-align: middle;
animation: blink 1s infinite;
}
.dark .cursor {
display: inline-block;
width: 7px;
height: 1em;
background-color: white;
vertical-align: middle;
animation: blink 1s infinite;
}
@keyframes blink {
0%, 50% { opacity: 1; }
50.1%, 100% { opacity: 0; }
}
.app {
max-width: 100% !important;
padding: 20px !important;
}
a {
color: #1976D2; /* Your current link color, a shade of blue */
text-decoration: none; /* Removes underline from links */
}
a:hover {
color: #63A4FF; /* This can be any color you choose for hover */
text-decoration: underline; /* Adds underline on hover */
}
"""
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--share", action="store_true")
parser.add_argument("--host", default="0.0.0.0")
parser.add_argument("--port", type=int, default=7860)
args = parser.parse_args()
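    # Pick the newest snapshot of each artifact: filenames are expected to look
    # like elo_results_<YYYYMMDD>.pkl and leaderboard_table_<YYYYMMDD>.csv, and
    # the slices below ([12:-4], [18:-4]) strip the prefix and extension so the
    # files sort by their numeric date suffix.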
elo_result_files = glob.glob("elo_results_*.pkl")
elo_result_files.sort(key=lambda x: int(x[12:-4]))
elo_result_file = elo_result_files[-1]
leaderboard_table_files = glob.glob("leaderboard_table_*.csv")
leaderboard_table_files.sort(key=lambda x: int(x[18:-4]))
leaderboard_table_file = leaderboard_table_files[-1]
demo = build_demo(elo_result_file, leaderboard_table_file)
demo.launch(show_api=False)