import argparse
import ast
import glob
import pickle
import traceback
from datetime import datetime

import pandas as pd
import gradio as gr
import numpy as np

basic_component_values = [None] * 6
leader_component_values = [None] * 5

promo_banner = """
llmarena.ru - ИСПОЛЬЗУЙТЕ БЕСПЛАТНО ПОСЛЕДНИЕ ВЕРСИИ ЛУЧШИХ ЧАТ-БОТОВ НА РУССКОМ
""" def make_default_md_1(): leaderboard_md = f""" # 🏆 LLM арена на русском: таблица лидеров {promo_banner} """ return leaderboard_md def make_default_md_2(): leaderboard_md = f""" Платформа LLM Arena является открытой краудсорсинговой платформой для оценки больших языковых моделей (LLM) на русском языке. Мы собираем парные сравнения от людей, чтобы ранжировать LLM с помощью модели Брэдли-Терри и отображать рейтинги моделей по шкале Эло. Chatbot Arena на русском зависит от участия сообщества, пожалуйста, внесите свой вклад, отдав свой голос! - Чтобы **добавить свою модель** в сравнение - напишите нам в tg: [Группа](https://t.me/+bFEOl-Bdmok4NGUy) - Если вы **нашли ошибку**, либо у вас **есть предложение** - напишите нам: [Роман](https://t.me/roman_kucev) """ return leaderboard_md def make_arena_leaderboard_md(arena_df, last_updated_time): total_votes = sum(arena_df["num_battles"]) total_models = len(arena_df) space = "   " leaderboard_md = f""" Всего #моделей: **{total_models}**.{space} Всего #голосов: **{"{:,}".format(total_votes)}**.{space} Последнее обновление: {last_updated_time}. ***Ранг (UB)**: рейтинг модели (верхняя граница), определяется как один плюс количество моделей, которые статистически лучше целевой модели. Модель A статистически лучше модели B, когда нижняя граница оценки модели A больше верхней границы оценки модели B (с доверительным интервалом 95%). См. Рисунок 1 ниже для визуализации доверительных интервалов оценок моделей. """ return leaderboard_md def make_category_arena_leaderboard_md(arena_df, arena_subset_df, name="Overall"): total_votes = sum(arena_df["num_battles"]) total_models = len(arena_df) space = "   " total_subset_votes = sum(arena_subset_df["num_battles"]) total_subset_models = len(arena_subset_df) leaderboard_md = f"""### {cat_name_to_explanation[name]} #### {space} #модели: **{total_subset_models} ({round(total_subset_models/total_models *100)}%)** {space} #голоса: **{"{:,}".format(total_subset_votes)} ({round(total_subset_votes/total_votes * 100)}%)**{space} """ return leaderboard_md def model_hyperlink(model_name, link): return f'{model_name}' def load_leaderboard_table_csv(filename, add_hyperlink=True): lines = open(filename).readlines() heads = [v.strip() for v in lines[0].split(",")] rows = [] for i in range(1, len(lines)): row = [v.strip() for v in lines[i].split(",")] for j in range(len(heads)): item = {} for h, v in zip(heads, row): if h == "Arena Elo rating": if v != "-": v = int(ast.literal_eval(v)) else: v = np.nan elif h == "MMLU": if v != "-": v = round(ast.literal_eval(v) * 100, 1) else: v = np.nan elif h == "MT-bench (win rate %)": if v != "-": v = round(ast.literal_eval(v[:-1]), 1) else: v = np.nan elif h == "MT-bench (score)": if v != "-": v = round(ast.literal_eval(v), 2) else: v = np.nan item[h] = v if add_hyperlink: item["Model"] = model_hyperlink(item["Model"], item["Link"]) rows.append(item) return rows def create_ranking_str(ranking, ranking_difference): if ranking_difference > 0: return f"{int(ranking)} \u2191" elif ranking_difference < 0: return f"{int(ranking)} \u2193" else: return f"{int(ranking)}" def recompute_final_ranking(arena_df): # compute ranking based on CI ranking = {} for i, model_a in enumerate(arena_df.index): ranking[model_a] = 1 for j, model_b in enumerate(arena_df.index): if i == j: continue if ( arena_df.loc[model_b]["rating_q025"] > arena_df.loc[model_a]["rating_q975"] ): ranking[model_a] += 1 return list(ranking.values()) def get_arena_table(arena_df, model_table_df, arena_subset_df=None): arena_df = 
def get_arena_table(arena_df, model_table_df, arena_subset_df=None):
    arena_df = arena_df.sort_values(
        by=["final_ranking", "rating"], ascending=[True, False]
    )
    arena_df["final_ranking"] = recompute_final_ranking(arena_df)
    arena_df = arena_df.sort_values(
        by=["final_ranking", "rating"], ascending=[True, False]
    )

    # sort by rating
    if arena_subset_df is not None:
        # filter out models not in the arena_df
        arena_subset_df = arena_subset_df[arena_subset_df.index.isin(arena_df.index)]
        arena_subset_df = arena_subset_df.sort_values(by=["rating"], ascending=False)
        arena_subset_df["final_ranking"] = recompute_final_ranking(arena_subset_df)
        # keep only the models in the subset in arena_df and recompute final_ranking
        arena_df = arena_df[arena_df.index.isin(arena_subset_df.index)]
        # recompute final ranking
        arena_df["final_ranking"] = recompute_final_ranking(arena_df)

        # assign ranking by the order
        arena_subset_df["final_ranking_no_tie"] = range(1, len(arena_subset_df) + 1)
        arena_df["final_ranking_no_tie"] = range(1, len(arena_df) + 1)

        # join arena_df and arena_subset_df on index
        arena_df = arena_subset_df.join(
            arena_df["final_ranking"], rsuffix="_global", how="inner"
        )
        arena_df["ranking_difference"] = (
            arena_df["final_ranking_global"] - arena_df["final_ranking"]
        )

        arena_df = arena_df.sort_values(
            by=["final_ranking", "rating"], ascending=[True, False]
        )
        arena_df["final_ranking"] = arena_df.apply(
            lambda x: create_ranking_str(x["final_ranking"], x["ranking_difference"]),
            axis=1,
        )

    arena_df["final_ranking"] = arena_df["final_ranking"].astype(str)

    values = []
    for i in range(len(arena_df)):
        row = []
        model_key = arena_df.index[i]
        try:
            model_name = model_table_df[model_table_df["key"] == model_key][
                "Model"
            ].values[0]
            ranking = arena_df.iloc[i].get("final_ranking") or i + 1
            row.append(ranking)
            if arena_subset_df is not None:
                row.append(arena_df.iloc[i].get("ranking_difference") or 0)
            row.append(model_name)
            row.append(round(arena_df.iloc[i]["rating"]))
            upper_diff = round(
                arena_df.iloc[i]["rating_q975"] - arena_df.iloc[i]["rating"]
            )
            lower_diff = round(
                arena_df.iloc[i]["rating"] - arena_df.iloc[i]["rating_q025"]
            )
            row.append(f"+{upper_diff}/-{lower_diff}")
            row.append(round(arena_df.iloc[i]["num_battles"]))
            row.append(
                model_table_df[model_table_df["key"] == model_key][
                    "Organization"
                ].values[0]
            )
            row.append(
                model_table_df[model_table_df["key"] == model_key]["License"].values[0]
            )
            cutoff_date = model_table_df[model_table_df["key"] == model_key][
                "Knowledge cutoff date"
            ].values[0]
            if cutoff_date == "-":
                row.append("Unknown")
            else:
                row.append(cutoff_date)
            values.append(row)
        except Exception as e:
            traceback.print_exc()
            print(f"{model_key} - {e}")
    return values


key_to_category_name = {
    "full": "Overall",
    "crowdsourcing/simple_prompts": "crowdsourcing/simple_prompts",
    "site_visitors/medium_prompts": "site_visitors/medium_prompts",
    "site_visitors/medium_prompts:style control": "site_visitors/medium_prompts:style control",
}
cat_name_to_explanation = {
    "Overall": "Все запросы",
    "crowdsourcing/simple_prompts": "Запросы, собранные с краудсорсинга. Преимущественно, простые.",
    "site_visitors/medium_prompts": "Запросы от пользователей сайта. Содержат более сложные промпты.",
    "site_visitors/medium_prompts:style control": "Запросы от пользователей сайта. Содержат более сложные промпты. [Снижено влияние стилистики](https://lmsys.org/blog/2024-08-28-style-control/) ответа на оценку.",
}
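# Note: the keys of key_to_category_name are the category keys expected inside
# the elo results pickle (see read_elo_file below); the mapped display names
# are what cat_name_to_explanation, actual_categories, and the category
# dropdown operate on.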
cat_name_to_baseline = {
    "Hard Prompts (English)": "English",
}
actual_categories = [
    "Overall",
    "crowdsourcing/simple_prompts",
    "site_visitors/medium_prompts",
    "site_visitors/medium_prompts:style control",
]


def read_elo_file(elo_results_file, leaderboard_table_file):
    arena_dfs = {}
    category_elo_results = {}
    with open(elo_results_file, "rb") as fin:
        elo_results = pickle.load(fin)
    last_updated_time = None
    if "full" in elo_results:
        last_updated_time = elo_results["full"]["last_updated_datetime"].split(" ")[0]
    for k in key_to_category_name.keys():
        if k not in elo_results:
            continue
        arena_dfs[key_to_category_name[k]] = elo_results[k]["leaderboard_table_df"]
        category_elo_results[key_to_category_name[k]] = elo_results[k]
    data = load_leaderboard_table_csv(leaderboard_table_file)
    model_table_df = pd.DataFrame(data)
    return last_updated_time, arena_dfs, category_elo_results, elo_results, model_table_df

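# Assumed layout of elo_results_*.pkl (inferred from the lookups above and in
# build_leaderboard_tab below): a dict keyed by category ("full",
# "crowdsourcing/simple_prompts", ...), where each entry holds at least
# "leaderboard_table_df" plus the pre-rendered plots "win_fraction_heatmap",
# "battle_count_heatmap", "bootstrap_elo_rating", "average_win_rate_bar";
# the "full" entry additionally carries "last_updated_datetime".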
def build_leaderboard_tab(
    elo_results_file, leaderboard_table_file, show_plot=False, mirror=False
):
    arena_dfs = {}
    arena_df = pd.DataFrame()
    category_elo_results = {}
    last_updated_time, arena_dfs, category_elo_results, elo_results, model_table_df = read_elo_file(
        elo_results_file, leaderboard_table_file
    )
    p1 = category_elo_results["Overall"]["win_fraction_heatmap"]
    p2 = category_elo_results["Overall"]["battle_count_heatmap"]
    p3 = category_elo_results["Overall"]["bootstrap_elo_rating"]
    p4 = category_elo_results["Overall"]["average_win_rate_bar"]
    arena_df = arena_dfs["Overall"]
    default_md = make_default_md_1()
    default_md_2 = make_default_md_2()

    with gr.Row():
        with gr.Column(scale=4):
            md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
        with gr.Column(scale=1):
            vote_button = gr.Button("Голосовать!", link="https://llmarena.ru")
    md_2 = gr.Markdown(default_md_2, elem_id="leaderboard_markdown")

    if leaderboard_table_file:
        data = load_leaderboard_table_csv(leaderboard_table_file)
        model_table_df = pd.DataFrame(data)

        with gr.Tabs() as tabs:
            arena_table_vals = get_arena_table(arena_df, model_table_df)
            with gr.Tab("Арена", id=0):
                md = make_arena_leaderboard_md(arena_df, last_updated_time)
                lb_description = gr.Markdown(md, elem_id="leaderboard_markdown")
                with gr.Row():
                    with gr.Column(scale=2):
                        category_dropdown = gr.Dropdown(
                            choices=actual_categories,
                            label="Category",
                            value="Overall",
                        )
                    default_category_details = make_category_arena_leaderboard_md(
                        arena_df, arena_df, name="Overall"
                    )
                    with gr.Column(scale=4, variant="panel"):
                        category_deets = gr.Markdown(
                            default_category_details, elem_id="category_deets"
                        )

                arena_vals = pd.DataFrame(
                    arena_table_vals,
                    columns=[
                        "Rank* (UB)",
                        "Model",
                        "Arena Elo",
                        "95% CI",
                        "Votes",
                        "Organization",
                        "License",
                        "Knowledge Cutoff",
                    ],
                )
                elo_display_df = gr.Dataframe(
                    headers=[
                        "Rank* (UB)",
                        "Model",
                        "Arena Elo",
                        "95% CI",
                        "Votes",
                        "Organization",
                        "License",
                        "Knowledge Cutoff",
                    ],
                    datatype=[
                        "str",
                        "markdown",
                        "number",
                        "str",
                        "number",
                        "str",
                        "str",
                        "str",
                    ],
                    value=arena_vals.style,
                    elem_id="arena_leaderboard_dataframe",
                    height=700,
                    column_widths=[70, 190, 100, 100, 90, 130, 150, 100],
                    wrap=True,
                )

                gr.Markdown(
                    elem_id="leaderboard_markdown",
                )

                leader_component_values[:] = [default_md, p1, p2, p3, p4]

                if show_plot:
                    more_stats_md = gr.Markdown(
                        f"""## Больше статистики Чат-бот Арены""",
                        elem_id="leaderboard_header_markdown",
                    )
                    with gr.Row():
                        with gr.Column():
                            gr.Markdown(
                                "#### Figure 1: Confidence Intervals on Model Strength (via Bootstrapping)",
                                elem_id="plot-title",
                            )
                            plot_3 = gr.Plot(p3, show_label=False)
                        with gr.Column():
                            gr.Markdown(
                                "#### Figure 2: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)",
                                elem_id="plot-title",
                            )
                            plot_4 = gr.Plot(p4, show_label=False)
                    with gr.Row():
                        with gr.Column():
                            gr.Markdown(
                                "#### Figure 3: Fraction of Model A Wins for All Non-tied A vs. B Battles",
                                elem_id="plot-title",
                            )
                            plot_1 = gr.Plot(
                                p1, show_label=False, elem_id="plot-container"
                            )
                        with gr.Column():
                            gr.Markdown(
                                "#### Figure 4: Battle Count for Each Combination of Models (without Ties)",
                                elem_id="plot-title",
                            )
                            plot_2 = gr.Plot(p2, show_label=False)
                if not show_plot:
                    gr.Markdown(
                        """ """,
                        elem_id="leaderboard_markdown",
                    )
    else:
        pass

    def update_leaderboard_df(arena_table_vals):
        elo_dataframe = pd.DataFrame(
            arena_table_vals,
            columns=[
                "Rank* (UB)",
                "Delta",
                "Model",
                "Arena Elo",
                "95% CI",
                "Votes",
                "Organization",
                "License",
                "Knowledge Cutoff",
            ],
        )

        def highlight_max(s):
            return [
                "color: green; font-weight: bold"
                if "\u2191" in v
                else "color: red; font-weight: bold"
                if "\u2193" in v
                else ""
                for v in s
            ]

        def highlight_rank_max(s):
            return [
                "color: green; font-weight: bold"
                if v > 0
                else "color: red; font-weight: bold"
                if v < 0
                else ""
                for v in s
            ]

        return elo_dataframe.style.apply(highlight_max, subset=["Rank* (UB)"]).apply(
            highlight_rank_max, subset=["Delta"]
        )

    def update_leaderboard_and_plots(category):
        _, arena_dfs, category_elo_results, _, model_table_df = read_elo_file(
            elo_results_file, leaderboard_table_file
        )
        arena_subset_df = arena_dfs[category]
        arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 300]
        elo_subset_results = category_elo_results[category]

        baseline_category = cat_name_to_baseline.get(category, "Overall")
        arena_df = arena_dfs[baseline_category]
        arena_values = get_arena_table(
            arena_df,
            model_table_df,
            arena_subset_df=arena_subset_df if category != "Overall" else None,
        )
        if category != "Overall":
            arena_values = update_leaderboard_df(arena_values)
            arena_values = gr.Dataframe(
                headers=[
                    "Rank* (UB)",
                    "Delta",
                    "Model",
                    "Arena Elo",
                    "95% CI",
                    "Votes",
                    "Organization",
                    "License",
                    "Knowledge Cutoff",
                ],
                datatype=[
                    "str",
                    "number",
                    "markdown",
                    "number",
                    "str",
                    "number",
                    "str",
                    "str",
                    "str",
                ],
                value=arena_values,
                elem_id="arena_leaderboard_dataframe",
                height=700,
                column_widths=[70, 70, 200, 90, 100, 90, 120, 150, 100],
                wrap=True,
            )
        else:
            arena_values = gr.Dataframe(
                headers=[
                    "Rank* (UB)",
                    "Model",
                    "Arena Elo",
                    "95% CI",
                    "Votes",
                    "Organization",
                    "License",
                    "Knowledge Cutoff",
                ],
                datatype=[
                    "str",
                    "markdown",
                    "number",
                    "str",
                    "number",
                    "str",
                    "str",
                    "str",
                ],
                value=arena_values,
                elem_id="arena_leaderboard_dataframe",
                height=700,
                column_widths=[70, 190, 100, 100, 90, 140, 150, 100],
                wrap=True,
            )

        p1 = elo_subset_results["win_fraction_heatmap"]
        p2 = elo_subset_results["battle_count_heatmap"]
        p3 = elo_subset_results["bootstrap_elo_rating"]
        p4 = elo_subset_results["average_win_rate_bar"]
        more_stats_md = f"""## More Statistics for Chatbot Arena - {category}
"""
        leaderboard_md = make_category_arena_leaderboard_md(
            arena_df, arena_subset_df, name=category
        )
        return arena_values, p1, p2, p3, p4, more_stats_md, leaderboard_md

    if leaderboard_table_file:
        category_dropdown.change(
            fn=update_leaderboard_and_plots,
            inputs=[category_dropdown],
            outputs=[
                elo_display_df,
                plot_1,
                plot_2,
                plot_3,
                plot_4,
                more_stats_md,
                category_deets,
            ],
        )

    if show_plot and leaderboard_table_file:
        return [
            md_1,
            md_2,
            lb_description,
            category_deets,
            elo_display_df,
            plot_1,
            plot_2,
            plot_3,
            plot_4,
        ]
    return [md_1]

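# Note: build_demo() below loads the visual theme from a local theme.json
# (a Gradio theme dump); the file is assumed to sit next to this script, and
# gr.themes.Default.load will raise if it is missing.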
def build_demo(elo_results_file, leaderboard_table_file):
    text_size = gr.themes.sizes.text_lg
    theme = gr.themes.Default.load("theme.json")
    theme.text_size = text_size
    theme.set(
        button_large_text_size="40px",
        button_small_text_size="40px",
        button_large_text_weight="1000",
        button_small_text_weight="1000",
        button_shadow="*shadow_drop_lg",
        button_shadow_hover="*shadow_drop_lg",
        checkbox_label_shadow="*shadow_drop_lg",
        button_shadow_active="*shadow_inset",
        button_secondary_background_fill="*primary_300",
        button_secondary_background_fill_dark="*primary_700",
        button_secondary_background_fill_hover="*primary_200",
        button_secondary_background_fill_hover_dark="*primary_500",
        button_secondary_text_color="*primary_800",
        button_secondary_text_color_dark="white",
    )

    with gr.Blocks(
        title="LLM арена: таблица лидеров",
        theme=theme,
        css=block_css,
    ) as demo:
        build_leaderboard_tab(
            elo_results_file, leaderboard_table_file, show_plot=True, mirror=True
        )
    return demo


block_css = """
#notice_markdown .prose {
    font-size: 110% !important;
}
#notice_markdown th {
    display: none;
}
#notice_markdown td {
    padding-top: 6px;
    padding-bottom: 6px;
}
#arena_leaderboard_dataframe table {
    font-size: 110%;
}
#full_leaderboard_dataframe table {
    font-size: 110%;
}
#model_description_markdown {
    font-size: 110% !important;
}
#leaderboard_markdown .prose {
    font-size: 110% !important;
}
#leaderboard_markdown td {
    padding-top: 6px;
    padding-bottom: 6px;
}
#leaderboard_dataframe td {
    line-height: 0.1em;
}
#about_markdown .prose {
    font-size: 110% !important;
}
#ack_markdown .prose {
    font-size: 110% !important;
}
#chatbot .prose {
    font-size: 105% !important;
}
.sponsor-image-about img {
    margin: 0 20px;
    margin-top: 20px;
    height: 40px;
    max-height: 100%;
    width: auto;
    float: left;
}
.chatbot h1, h2, h3 {
    margin-top: 8px; /* Adjust the value as needed */
    margin-bottom: 0px; /* Adjust the value as needed */
    padding-bottom: 0px;
}
.chatbot h1 {
    font-size: 130%;
}
.chatbot h2 {
    font-size: 120%;
}
.chatbot h3 {
    font-size: 110%;
}
.chatbot p:not(:first-child) {
    margin-top: 8px;
}
.typing {
    display: inline-block;
}
.cursor {
    display: inline-block;
    width: 7px;
    height: 1em;
    background-color: black;
    vertical-align: middle;
    animation: blink 1s infinite;
}
.dark .cursor {
    display: inline-block;
    width: 7px;
    height: 1em;
    background-color: white;
    vertical-align: middle;
    animation: blink 1s infinite;
}
@keyframes blink {
    0%, 50% { opacity: 1; }
    50.1%, 100% { opacity: 0; }
}
.app {
    max-width: 100% !important;
    padding: 20px !important;
}
a {
    color: #1976D2; /* Your current link color, a shade of blue */
    text-decoration: none; /* Removes underline from links */
}
a:hover {
    color: #63A4FF; /* This can be any color you choose for hover */
    text-decoration: underline; /* Adds underline on hover */
}
"""

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--share", action="store_true")
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--port", type=int, default=7860)
    args = parser.parse_args()

    # Pick the most recent result files by the numeric suffix in the filename.
    elo_result_files = glob.glob("elo_results_*.pkl")
    elo_result_files.sort(key=lambda x: int(x[12:-4]))
    elo_result_file = elo_result_files[-1]
    leaderboard_table_files = glob.glob("leaderboard_table_*.csv")
    leaderboard_table_files.sort(key=lambda x: int(x[18:-4]))
    leaderboard_table_file = leaderboard_table_files[-1]

    demo = build_demo(elo_result_file, leaderboard_table_file)
    demo.launch(
        server_name=args.host,
        server_port=args.port,
        share=args.share,
        show_api=False,
    )
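# Expected working-directory layout (inferred from the globs above): the newest
# elo_results_<digits>.pkl and leaderboard_table_<digits>.csv are picked up
# automatically. Assuming this file is saved as app.py, a typical local run is:
#   python app.py --host 0.0.0.0 --port 7860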