# Page header captured together with this file: "Spaces: Running"
from dataclasses import dataclass
from enum import Enum
@dataclass
class Task:
    """Describe one benchmark entry shown on the leaderboard.

    The ``@dataclass`` decorator generates the ``__init__`` that lets
    ``Task(benchmark, metric, col_name)`` be built positionally, which is
    how the ``Tasks`` enum members construct their values.

    Attributes:
        benchmark: Task key in the results JSON file.
        metric: Metric key in the results JSON file.
        col_name: Name to display in the leaderboard.
    """

    benchmark: str
    metric: str
    col_name: str
# Init: to update with your specific keys | |
class Tasks(Enum):
    """Tasks reported on the leaderboard, one enum member per task.

    Each member's value is a ``Task`` carrying, in order: the task key in
    the JSON file, the metric key in the JSON file, and the name to display
    in the leaderboard.
    """

    task0 = Task(benchmark="task_name1", metric="metric_name", col_name="First task")
    task1 = Task(benchmark="task_name2", metric="metric_name", col_name="Second task")
# Your leaderboard name (HTML heading).
# NOTE(review): the leading "π" looks like a mis-decoded emoji — confirm the
# intended character against the original file before changing it.
TITLE = """<h1 align="center" id="space-title">π Auto Arena of LLMs</h1>"""
# Subtitle (HTML heading). The element is opened as <h2>, so it must be
# closed with </h2>.
# NOTE(review): this reuses id="space-title", the same id as TITLE; HTML ids
# are meant to be unique — confirm whether any styling depends on it.
SUB_TITLE = """<h2 align="center" id="space-title">Automating LLM Evaluations with Agent Peer-battles and Committee Discussions</h2>"""
# What does your leaderboard evaluate? (Markdown)
# NOTE(review): the "[paper]()" link has an empty target — fill in the paper
# URL once it is available.
INTRODUCTION_TEXT = """
This leaderboard is from a completely automated large language model (LLM) evaluation framework by employing various LLM agents in peer-battles and committee discussions.
You can find more details from the [project page](https://auto-arena.github.io/) and our [paper]().
"""
# Earlier wording, kept for reference:
# For additional details such as datasets, evaluation criteria, and reproducibility, please refer to the "π About" tab.
# Stay tuned for the *SeaBench leaderboard* - focusing on evaluating the model's ability to respond to general human instructions in real-world multi-turn settings.
# """
# Which evaluations are you running? How can people reproduce what you have?
# Plain string: the text has no placeholders, so no f-string prefix is needed.
# NOTE(review): the body is a lone Markdown code fence (```), which is left
# unclosed — confirm whether content is missing here.
LLM_BENCHMARKS_TEXT = """
```
"""
# You can find the detailed numerical results in the results Hugging Face dataset: https://huggingface.co/datasets/SeaLLMs/SeaExam-results
# Text for the evaluation queue section (currently a single blank line).
EVALUATION_QUEUE_TEXT = """
"""

# Label and body of the citation box (both currently empty of content).
CITATION_BUTTON_LABEL = ""
# Raw string so that backslashes in a future citation entry stay literal.
CITATION_BUTTON_TEXT = r"""
"""
# Markdown for the contact section. Plain string: the text has no
# placeholders, so no f-string prefix is needed.
CONTACT_TEXT = """
## Contact
"""